diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
index af9eb75a0288..cb22b598a94b 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -77,6 +77,12 @@ llvm::cl::opt<bool> clGPUUnalignedGEMMVectorDistribution(
         "unaligned GEMMs when supported"),
     llvm::cl::init(false));
 
+llvm::cl::opt<bool> clGPUUseTileAndFuseConvolution(
+    "iree-codegen-llvmgpu-use-tile-and-fuse-convolution",
+    llvm::cl::desc(
+        "enable the tile and fuse pipeline for supported convolutions"),
+    llvm::cl::init(true));
+
 /// Flag to force using WMMA tensorcore operations.
 llvm::cl::opt<bool>
     clGPUUseWMMA("iree-codegen-llvmgpu-use-wmma",
@@ -2196,12 +2202,19 @@ static bool distributeToSquare(const int64_t oh, const int64_t ow,
 
 // Convolution Pipeline Configuration
 //====---------------------------------------------------------------------===//
 
-static LogicalResult setConvolutionConfig(IREE::GPU::TargetAttr target,
-                                          linalg::LinalgOp linalgOp,
-                                          const int64_t bestTilingFactor) {
+static LogicalResult setConvolutionConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPointFn,
+    linalg::LinalgOp linalgOp, const int64_t bestTilingFactor) {
   if (!isa<linalg::Conv2DNchwFchwOp, linalg::Conv2DNhwcHwcfOp>(linalgOp)) {
     return failure();
   }
+  if (clGPUUseTileAndFuseConvolution) {
+    if (succeeded(IREE::GPU::setTileAndFuseLoweringConfig(target, entryPointFn,
+                                                          linalgOp))) {
+      LDBG("Tile and fuse convolution config");
+      return success();
+    }
+  }
   const bool isNCHW = isa<linalg::Conv2DNchwFchwOp>(*linalgOp);
   const bool isNHWC = isa<linalg::Conv2DNhwcHwcfOp>(*linalgOp);
@@ -2284,9 +2297,8 @@ static LogicalResult setConvolutionConfig(IREE::GPU::TargetAttr target,
   SmallVector<int64_t> windowTileSizes(4, 0);
   windowTileSizes[ohIndex] = 1;
   tileSizes.push_back(windowTileSizes);
-  auto funcOp = linalgOp->getParentOfType<mlir::FunctionOpInterface>();
-  return setOpConfigAndEntryPointFnTranslation(funcOp, linalgOp, tileSizes,
-                                               pipeline, workgroupSize);
+  return setOpConfigAndEntryPointFnTranslation(
+      entryPointFn, linalgOp, tileSizes, pipeline, workgroupSize);
 }
 
 //====---------------------------------------------------------------------===//
@@ -2340,7 +2352,7 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
     LDBG("Warp Reduction Config");
     return success();
   }
-  if (succeeded(setConvolutionConfig(target, linalgOp, 16))) {
+  if (succeeded(setConvolutionConfig(target, entryPointFn, linalgOp, 16))) {
     LDBG("Convolution Config");
     return success();
   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_cuda.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_cuda.mlir
index d129117741e3..af33828f9135 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_cuda.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/conv_pipeline_test_cuda.mlir
@@ -1,4 +1,4 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 \
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=sm_60 --iree-codegen-llvmgpu-use-tile-and-fuse-convolution=false \
 // RUN:   --pass-pipeline='builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-llvmgpu-select-lowering-strategy, func.func(iree-llvmgpu-lower-executable-target,canonicalize)))))' \
 // RUN:   %s | FileCheck %s
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
index feb0e2766303..66fc62f2e482 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir
@@ -612,12 +612,11 @@ func.func @forward_dispatch_1_conv_2d_nhwc_hwcf_256x112x112x64x7x7x3_f32() {
   return
 }
 
-// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config
+// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info
 
 // -----