From a65e8e3ce61976ff8c216675bc3376e84216a27f Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Mon, 7 Oct 2019 15:51:34 +0000
Subject: [PATCH] [ROCm] Adding pass to generate the HSACO binary blob from the
 GPU kernel function

---
 CMakeLists.txt                                |   7 +
 include/mlir/CMakeLists.txt                   |   1 +
 .../mlir/Conversion/GPUToROCM/CMakeLists.txt  |  33 ++
 .../mlir/Conversion/GPUToROCM/GPUToROCMPass.h |  92 ++++
 .../mlir/Conversion/GPUToROCM/ROCMConfig.h.in |  30 ++
 lib/Conversion/CMakeLists.txt                 |   1 +
 lib/Conversion/GPUToROCM/CMakeLists.txt       |  15 +
 .../GPUToROCM/ConvertKernelFuncToHSACO.cpp    | 407 ++++++++++++++++++
 test/Conversion/GPUToROCM/lit.local.cfg       |   2 +
 .../lower-amdgpu-kernel-to-hsaco.mlir         |  31 ++
 test/lit.site.cfg.py.in                       |   1 +
 tools/mlir-opt/CMakeLists.txt                 |   5 +
 12 files changed, 625 insertions(+)
 create mode 100644 include/mlir/Conversion/GPUToROCM/CMakeLists.txt
 create mode 100644 include/mlir/Conversion/GPUToROCM/GPUToROCMPass.h
 create mode 100644 include/mlir/Conversion/GPUToROCM/ROCMConfig.h.in
 create mode 100644 lib/Conversion/GPUToROCM/CMakeLists.txt
 create mode 100644 lib/Conversion/GPUToROCM/ConvertKernelFuncToHSACO.cpp
 create mode 100644 test/Conversion/GPUToROCM/lit.local.cfg
 create mode 100644 test/Conversion/GPUToROCM/lower-amdgpu-kernel-to-hsaco.mlir

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5329de4e8935..01acc0d7cd93 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -44,6 +44,13 @@ endif()
 
 set(MLIR_CUDA_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir CUDA runner")
 
+# The ROCm conversions need LLVM's AMDGPU backend; turn them off without it.
+if(NOT "AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)
+  set(MLIR_ROCM_CONVERSIONS_ENABLED 0)
+else()
+  set(MLIR_ROCM_CONVERSIONS_ENABLED 1)
+endif()
+
 include_directories( "include")
 include_directories( ${MLIR_INCLUDE_DIR})
 
diff --git a/include/mlir/CMakeLists.txt b/include/mlir/CMakeLists.txt
index 1a5094df90db..2977e49500e5 100644
--- a/include/mlir/CMakeLists.txt
+++ b/include/mlir/CMakeLists.txt
@@ -2,3 +2,4 @@ add_subdirectory(Analysis)
 add_subdirectory(Dialect)
 add_subdirectory(EDSC)
 add_subdirectory(Transforms)
+add_subdirectory(Conversion/GPUToROCM)
diff --git a/include/mlir/Conversion/GPUToROCM/CMakeLists.txt b/include/mlir/Conversion/GPUToROCM/CMakeLists.txt
new file mode 100644
index 000000000000..e0a95217f454
--- /dev/null
+++ b/include/mlir/Conversion/GPUToROCM/CMakeLists.txt
@@ -0,0 +1,33 @@
+if(MLIR_ROCM_CONVERSIONS_ENABLED)
+
+  # Root of the ROCm installation; override with -DROCM_INSTALL_DIR=<path>
+  set(ROCM_INSTALL_DIR "/opt/rocm" CACHE PATH "ROCm installation directory")
+  if(EXISTS "${ROCM_INSTALL_DIR}")
+    message(STATUS "ROCm Install Dir - ${ROCM_INSTALL_DIR}")
+  else()
+    message(SEND_ERROR "-- NOT FOUND : ROCm Install Dir - ${ROCM_INSTALL_DIR}")
+  endif()
+
+  # Check whether the ROCm device library dir exists
+  set(ROCM_DEVICE_LIB_DIR "${ROCM_INSTALL_DIR}/lib")
+  if(EXISTS "${ROCM_DEVICE_LIB_DIR}")
+    message(STATUS "ROCm Device Library Dir - ${ROCM_DEVICE_LIB_DIR}")
+  else()
+    message(SEND_ERROR "-- NOT FOUND : ROCm Device Library Dir - ${ROCM_DEVICE_LIB_DIR}")
+  endif()
+
+  # Check whether the ROCm HCC linker exists
+  set(ROCM_HCC_LINKER "${ROCM_INSTALL_DIR}/hcc/bin/ld.lld")
+  if(EXISTS "${ROCM_HCC_LINKER}")
+    message(STATUS "ROCm HCC Linker - ${ROCM_HCC_LINKER}")
+  else()
+    message(SEND_ERROR "-- NOT FOUND : ROCm HCC Linker - ${ROCM_HCC_LINKER}")
+  endif()
+
+  # Generate the ROCm configuration header from its #cmakedefine template
+  configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/ROCMConfig.h.in"
+    "${CMAKE_CURRENT_BINARY_DIR}/ROCMConfig.h"
+    )
+
+endif()
diff --git a/include/mlir/Conversion/GPUToROCM/GPUToROCMPass.h b/include/mlir/Conversion/GPUToROCM/GPUToROCMPass.h
new file mode 100644
index 000000000000..e7698611d858
--- /dev/null
+++ b/include/mlir/Conversion/GPUToROCM/GPUToROCMPass.h
@@ -0,0 +1,92 @@
+//===- GPUToROCmPass.h - MLIR ROCm runtime support --------------*- C++ -*-===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef MLIR_CONVERSION_GPUTOROCM_GPUTOROCMPASS_H_
+#define MLIR_CONVERSION_GPUTOROCM_GPUTOROCMPASS_H_
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mlir/Conversion/GPUToROCM/ROCMConfig.h"
+
+namespace mlir {
+
+namespace rocm {
+
+/// string constants used by the ROCM backend
+static constexpr const char *kHSACOAnnotation = "amdgpu.hsaco";
+static constexpr const char *kHSACOGetterAnnotation = "amdgpu.hsacogetter";
+static constexpr const char *kHSACOGetterSuffix = "_hsaco";
+static constexpr const char *kHSACOStorageSuffix = "_hsaco_cst";
+
+/// enum to represent the AMD GPU versions supported by the ROCM backend
+enum class AMDGPUVersion { GFX900 };
+
+/// enum to represent the HSA Code Object versions supported by the ROCM backend
+enum class HSACOVersion { V3 };
+
+/// Configurable parameters for generating the HSACO blobs from GPU Kernels
+struct HSACOGeneratorConfig {
+
+  /// Constructor - sets the default values for the configurable parameters
+  HSACOGeneratorConfig(bool isTestMode)
+      : testMode(isTestMode), amdgpuVersion(AMDGPUVersion::GFX900),
+        hsacoVersion(HSACOVersion::V3), rocdlDir(ROCM_DEVICE_LIB_DIR),
+        linkerPath(ROCM_HCC_LINKER) {}
+
+  /// testMode == true will result in skipping the HSACO generation process and
+  /// simply returning the string "HSACO" as the HSACO blob
+  bool testMode;
+
+  /// the AMDGPU version for which to generate the HSACO
+  AMDGPUVersion amdgpuVersion;
+
+  /// the code object version for the generated HSACO
+  HSACOVersion hsacoVersion;
+
+  /// the directory containing the ROCDL bitcode libraries
+  std::string rocdlDir;
+
+  /// the path to the ld.lld linker to use when generating the HSACO
+  std::string linkerPath;
+};
+
+} // namespace rocm
+
+// unique pointer to the HSA Code Object (which is stored as char vector)
+using OwnedHSACO = std::unique_ptr<std::vector<char>>;
+
+class ModuleOp;
+template <typename T>
+class OpPassBase;
+
+/// Creates a pass to convert kernel functions into HSA Code Object blobs.
+///
+/// This transformation takes the body of each function that is annotated with
+/// the amdgpu_kernel calling convention, copies it to a new LLVM module,
+/// compiles the module with help of the AMDGPU backend to GCN ISA, and then
+/// invokes lld to produce a binary blob in HSA Code Object format. Such blob
+/// is then attached as a string attribute named 'amdgpu.hsaco' to the kernel
+/// function.  After the transformation, the body of the kernel function is
+/// removed (i.e., it is turned into a declaration).
+std::unique_ptr<OpPassBase<ModuleOp>> createConvertGPUKernelToHSACOPass(
+    rocm::HSACOGeneratorConfig hsacoGeneratorConfig);
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_GPUTOROCM_GPUTOROCMPASS_H_
diff --git a/include/mlir/Conversion/GPUToROCM/ROCMConfig.h.in b/include/mlir/Conversion/GPUToROCM/ROCMConfig.h.in
new file mode 100644
index 000000000000..4f326c8186a8
--- /dev/null
+++ b/include/mlir/Conversion/GPUToROCM/ROCMConfig.h.in
@@ -0,0 +1,30 @@
+//===- ROCMConfig.h - ROCm Configuration Header -----------------*- C++ -*-===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+#ifndef MLIR_CONVERSION_GPUTOROCM_ROCMCONFIG_H_
+#define MLIR_CONVERSION_GPUTOROCM_ROCMCONFIG_H_
+
+/// The code to generate the HSACO binary blobs (corresponding to the GPU
+/// kernels) assumes the presence of ROCm libraries/utilities. The location of
+/// these tools is configured via cmake
+
+/// Path to the ROCm Device Library dir in the ROCM install
+#cmakedefine ROCM_DEVICE_LIB_DIR "@ROCM_DEVICE_LIB_DIR@"
+
+/// Path to the HCC Linker in the ROCM install
+#cmakedefine ROCM_HCC_LINKER "@ROCM_HCC_LINKER@"
+
+#endif // MLIR_CONVERSION_GPUTOROCM_ROCMCONFIG_H_
diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt
index c0fd6b83fd95..57e3fb4cf114 100644
--- a/lib/Conversion/CMakeLists.txt
+++ b/lib/Conversion/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_subdirectory(GPUToCUDA)
 add_subdirectory(GPUToNVVM)
+add_subdirectory(GPUToROCM)
 add_subdirectory(GPUToROCDL)
 add_subdirectory(GPUToSPIRV)
 add_subdirectory(LoopsToGPU)
diff --git a/lib/Conversion/GPUToROCM/CMakeLists.txt b/lib/Conversion/GPUToROCM/CMakeLists.txt
new file mode 100644
index 000000000000..93eff82748b8
--- /dev/null
+++ b/lib/Conversion/GPUToROCM/CMakeLists.txt
@@ -0,0 +1,15 @@
+if(MLIR_ROCM_CONVERSIONS_ENABLED)
+  llvm_map_components_to_libnames(amdgpu "AMDGPU")
+
+  add_llvm_library(MLIRGPUtoROCMTransforms
+    ConvertKernelFuncToHSACO.cpp
+  )
+  target_link_libraries(MLIRGPUtoROCMTransforms
+    MLIRGPU
+    MLIRLLVMIR
+    MLIRROCDLIR
+    MLIRPass
+    MLIRTargetROCDLIR
+    ${amdgpu}
+  )
+endif()
diff --git a/lib/Conversion/GPUToROCM/ConvertKernelFuncToHSACO.cpp b/lib/Conversion/GPUToROCM/ConvertKernelFuncToHSACO.cpp
new file mode 100644
index 000000000000..dba2f25abb50
--- /dev/null
+++ b/lib/Conversion/GPUToROCM/ConvertKernelFuncToHSACO.cpp
@@ -0,0 +1,407 @@
+//===- ConvertKernelFuncToHSACO.cpp - MLIR GPU lowering passes ------------===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file implements a pass to convert gpu kernel functions into a
+// corresponding binary blob that can be executed on an AMD GPU. Currently
+// only translates the function itself but no dependencies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUToROCM/GPUToROCMPass.h"
+
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Function.h"
+#include "mlir/IR/Module.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Target/ROCDLIR.h"
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/IPO/Internalize.h"
+
+#include <fstream>
+#include <iostream>
+#include <string>
+
+using namespace mlir;
+
+#define DEBUG_TYPE "gpu-to-rocm-conversion"
+
+static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options");
+// Debugging aid: dump the LLVM IR to a file while generating the HSACO.
+static llvm::cl::opt<bool>
+    clDumpLLVMIR("rocm-dump-llvm-ir",
+                 llvm::cl::desc("Dump the LLVM IR when generating HSACO"),
+                 llvm::cl::cat(clOptionsCategory));
+namespace {
+
+/// A pass converting tagged kernel functions to HSA Code Object blobs.
+///
+/// If tagged as a kernel module, each contained function is translated to ROCDL
+/// IR, which is then compiled using the llvm AMDGPU backend to generate the GPU
+/// binary code (i.e. the HSACO file). The HSACO binary blob is attached as an
+/// attribute to the function and the function body is erased.
+class GpuKernelToHSACOPass : public ModulePass<GpuKernelToHSACOPass> {
+public:
+  GpuKernelToHSACOPass(rocm::HSACOGeneratorConfig hsacoGeneratorConfig =
+                           rocm::HSACOGeneratorConfig(/*isTestMode=*/true))
+      : config(hsacoGeneratorConfig) {}
+
+  // Translate a GPU kernel module to HSACO and attach it as an attribute.
+  void runOnModule() override {
+    ModuleOp module = getModule();
+
+    // Nothing to do if this module does not contain the "gpu.kernel_module"
+    // attribute, which is used to mark the (nested) modules created to house
+    // the GPU kernel functions
+    if (!module.getAttrOfType<UnitAttr>(
+            gpu::GPUDialect::getKernelModuleAttrName()) ||
+        !module.getName())
+      return;
+
+    // This is a module containing a GPU kernel function, we have work to do!
+
+    // Make sure the AMDGPU target is initialized.
+    LLVMInitializeAMDGPUTarget();
+    LLVMInitializeAMDGPUTargetInfo();
+    LLVMInitializeAMDGPUTargetMC();
+    LLVMInitializeAMDGPUAsmPrinter();
+
+    auto llvmModule = translateModuleToROCDLIR(module);
+    if (!llvmModule)
+      return signalPassFailure();
+
+    if (StringAttr hsacoAttr =
+            translateGpuModuleToHSACOAnnotation(*llvmModule, module))
+      module.setAttr(rocm::kHSACOAnnotation, hsacoAttr);
+    else
+      signalPassFailure();
+  }
+
+private:
+  /// Translates llvmModule to HSACO and returns the result as attribute.
+  StringAttr translateGpuModuleToHSACOAnnotation(llvm::Module &llvmModule,
+                                                 ModuleOp module);
+
+  OwnedHSACO convertModuleToHSACO(llvm::Module &llvmModule, ModuleOp module);
+
+  OwnedHSACO emitModuleToHSACO(llvm::Module &llvmModule, ModuleOp module,
+                               llvm::TargetMachine &targetMachine);
+
+  OwnedHSACO emitModuleToHSACOForTesting(llvm::Module &llvmModule,
+                                         ModuleOp module);
+
+  rocm::HSACOGeneratorConfig config;
+};
+
+} // anonymous namespace
+
+// get the "-mcpu" option string corresponding to the given AMDGPU version enum
+static std::string getMcpuOptionString(rocm::AMDGPUVersion v) {
+  switch (v) {
+  case rocm::AMDGPUVersion::GFX900:
+    return "gfx900";
+  }
+  return "<invalid AMDGPU version>";
+}
+
+// get filename for file containing the AMDGPU version specific bitcodes
+static std::string getBitcodeFilename(rocm::AMDGPUVersion v) {
+  switch (v) {
+  case rocm::AMDGPUVersion::GFX900:
+    return "oclc_isa_version_900.amdgcn.bc";
+  }
+  return "<invalid AMDGPU version>";
+}
+
+// get the option string corresponding to the given HSACO version enum
+static std::string getCodeObjectOptionString(rocm::HSACOVersion v) {
+  switch (v) {
+  case rocm::HSACOVersion::V3:
+    return "-code-object-v3";
+  }
+  return "<invalid HSACO version>";
+}
+
+// Gets the ROCm-Device-Libs filenames for a particular AMDGPU version.
+static std::vector<std::string>
+getROCDLPaths(const rocm::AMDGPUVersion amdgpuVersion,
+              llvm::StringRef rocdlDir) {
+
+  // AMDGPU version-neutral bitcodes.
+  static constexpr StringLiteral rocdlFilenames[] = {
+      "hc.amdgcn.bc",
+      "opencl.amdgcn.bc",
+      "ocml.amdgcn.bc",
+      "ockl.amdgcn.bc",
+      "oclc_finite_only_off.amdgcn.bc",
+      "oclc_daz_opt_off.amdgcn.bc",
+      "oclc_correctly_rounded_sqrt_on.amdgcn.bc",
+      "oclc_unsafe_math_off.amdgcn.bc",
+      "oclc_wavefrontsize64_on.amdgcn.bc"};
+
+  // Construct full path to ROCDL bitcode libraries.
+  std::vector<std::string> result;
+  for (auto filename : rocdlFilenames) {
+    llvm::SmallString<128> appendedPath;
+    llvm::sys::path::append(appendedPath, rocdlDir, filename);
+    result.push_back(appendedPath.str());
+  }
+
+  // Add AMDGPU version-specific bitcodes.
+  llvm::SmallString<128> appendedPath;
+  llvm::sys::path::append(appendedPath, rocdlDir,
+                          getBitcodeFilename(amdgpuVersion));
+  result.push_back(appendedPath.str());
+
+  return result;
+}
+
+// Links the given llvm module with the given bitcode modules. Paths that do
+// not exist are skipped with a warning; parse and link errors fail the link.
+static LogicalResult
+linkWithBitcodeModules(llvm::Module &llvmModule, ModuleOp module,
+                       llvm::ArrayRef<std::string> bitcodeModulePaths) {
+  llvm::Linker linker(llvmModule);
+
+  for (auto &filename : bitcodeModulePaths) {
+    if (!llvm::sys::fs::exists(filename)) {
+      module.emitWarning("ROCDL bitcode module was not found at " + filename);
+      // TODO(rocm)
+      // The list currently returned by "getROCDLPaths" routine is a superset
+      // and some files in that list may not be available on older ROCM
+      // releases. So commenting out the call to propagate error status.
+      // Error propagation should be restored once the list returned by
+      // "getROCDLPaths" is stable/accurate.
+      // return failure();
+      continue;
+    }
+
+    llvm::SMDiagnostic diagnostic;
+    std::unique_ptr<llvm::Module> bitcodeModule(
+        llvm::parseIRFile(filename, diagnostic, llvmModule.getContext()));
+
+    if (bitcodeModule == nullptr) {
+      MLIRContext *mlirContext = module.getContext();
+      auto parseErrorLocation = mlir::FileLineColLoc::get(
+          diagnostic.getFilename().str(), diagnostic.getLineNo(),
+          diagnostic.getColumnNo(), mlirContext);
+      mlir::emitError(parseErrorLocation, diagnostic.getMessage().str());
+      module.emitError("Error parsing ROCDL bitcode module from " + filename);
+      return failure();
+    }
+
+    if (linker.linkInModule(
+            std::move(bitcodeModule), llvm::Linker::Flags::LinkOnlyNeeded,
+            [](llvm::Module &M, const llvm::StringSet<> &GVS) {
+              internalizeModule(M, [&GVS](const llvm::GlobalValue &GV) {
+                return !GV.hasName() || (GVS.count(GV.getName()) == 0);
+              });
+            })) {
+      module.emitError("Error linking ROCDL bitcode module from " + filename);
+      return failure();
+    }
+  }
+
+  return success();
+}
+
+// Conservatively reports whether the module may use ROCDL bitcode functions.
+// May give false positives: not every declaration is in ROCm-Device-Libs.
+static bool couldNeedDeviceBitcode(const llvm::Module &llvmModule) {
+  for (const llvm::Function &fn : llvmModule.functions()) {
+    if (fn.isIntrinsic())
+      continue;
+    if (fn.isDeclaration())
+      return true;
+  }
+  return false;
+}
+
+// Links ROCm-Device-Libs into the given module if the module needs it.
+static LogicalResult linkROCDLIfNecessary(llvm::Module &llvmModule,
+                                          ModuleOp module,
+                                          rocm::AMDGPUVersion amdgpuVersion,
+                                          llvm::StringRef rocdlDir) {
+
+  if (!couldNeedDeviceBitcode(llvmModule))
+    return success();
+
+  return linkWithBitcodeModules(llvmModule, module,
+                                getROCDLPaths(amdgpuVersion, rocdlDir));
+}
+
+// Emits the given module to HSA Code Object. targetMachine is an initialized
+// TargetMachine for the AMDGPU target. Returns a null OwnedHSACO, after
+// emitting an error on `module`, if any step of the process fails.
+OwnedHSACO
+GpuKernelToHSACOPass::emitModuleToHSACO(llvm::Module &llvmModule,
+                                        ModuleOp module,
+                                        llvm::TargetMachine &targetMachine) {
+  llvm::SmallString<128> tempdirName;
+  if (llvm::sys::fs::createUniqueDirectory("/tmp/amdgpu_mlir", tempdirName)) {
+    module.emitError("Failed to create tempdir for generating HSACO\n");
+    return {};
+  }
+
+  const std::string &moduleName = llvmModule.getModuleIdentifier();
+  std::error_code ec;
+
+  if (clDumpLLVMIR) {
+    // Dump the LLVM IR to file (debugging aid only, so failures just warn).
+    llvm::SmallString<128> irPath;
+    llvm::sys::path::append(irPath, tempdirName, moduleName + ".ll");
+    llvm::raw_fd_ostream irFileStream(irPath, ec, llvm::sys::fs::F_None);
+    if (ec)
+      module.emitWarning("Failed to open LLVM IR dump file : " + ec.message());
+    else
+      llvmModule.print(irFileStream, nullptr);
+  }
+
+  // Emit the GCN ISA object file; it is binary, so do not open in text mode.
+  llvm::SmallString<128> isabinPath;
+  llvm::sys::path::append(isabinPath, tempdirName, moduleName + ".o");
+  llvm::raw_fd_ostream isabinFileStream(isabinPath, ec, llvm::sys::fs::F_None);
+  if (ec) {
+    module.emitError("Failed to open GCN ISA binary file : " + ec.message());
+    return {};
+  }
+  llvm::legacy::PassManager codegenPasses;
+  llvmModule.setDataLayout(targetMachine.createDataLayout());
+  // addPassesToEmitFile returns true if the target cannot emit this file type.
+  if (targetMachine.addPassesToEmitFile(
+          codegenPasses, isabinFileStream, nullptr,
+          llvm::TargetMachine::CGFT_ObjectFile)) {
+    module.emitError("AMDGPU target does not support object file emission");
+    return {};
+  }
+  codegenPasses.run(llvmModule);
+  isabinFileStream.flush();
+
+  // generate the hsaco binary
+  // TODO(rocm):
+  // Currently we invoke ld.lld as a separate process to generate the hsaco
+  // file. Ideally we would like invoke it (ld.lld) via an API call to do the
+  // same. That will require building the "lld" project (which apparently is
+  // at the same level as "llvm") and figuring out how to call it from within
+  // this "mlir" project.
+  llvm::SmallString<128> hsacoPath;
+  llvm::sys::path::append(hsacoPath, tempdirName, moduleName + ".hsaco");
+
+  llvm::StringRef lldProgram(config.linkerPath);
+  std::vector<llvm::StringRef> lldArgs{
+      "ld.lld", "-flavor", "gnu", "-shared", llvm::StringRef(isabinPath),
+      "-o",     llvm::StringRef(hsacoPath)};
+  std::string errorMessage;
+  int lldResult = llvm::sys::ExecuteAndWait(
+      lldProgram, llvm::ArrayRef<llvm::StringRef>(lldArgs), llvm::None, {}, 0,
+      0, &errorMessage);
+  if (lldResult) {
+    module.emitError("ld.lld execution failed : " + errorMessage);
+    return {};
+  }
+
+  // Read the generated HSACO back into memory.
+  auto hsacoFileOrError = llvm::MemoryBuffer::getFileAsStream(hsacoPath);
+  if ((ec = hsacoFileOrError.getError())) {
+    module.emitError("Failed to read HSACO file : " + ec.message());
+    return {};
+  }
+
+  std::unique_ptr<llvm::MemoryBuffer> hsacoFile =
+      std::move(hsacoFileOrError.get());
+  return std::make_unique<std::vector<char>>(hsacoFile->getBufferStart(),
+                                             hsacoFile->getBufferEnd());
+}
+
+OwnedHSACO
+GpuKernelToHSACOPass::emitModuleToHSACOForTesting(llvm::Module &llvmModule,
+                                                  ModuleOp module) {
+  const char data[] = "HSACO";
+  return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
+}
+
+OwnedHSACO GpuKernelToHSACOPass::convertModuleToHSACO(llvm::Module &llvmModule,
+                                                      ModuleOp module) {
+  // In test mode no real code generation happens; return the dummy blob.
+  if (config.testMode)
+    return emitModuleToHSACOForTesting(llvmModule, module);
+
+  // Construct LLVM TargetMachine for AMDGPU target.
+  std::string error;
+  llvm::Triple triple("amdgcn--amdhsa-amdgiz");
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget("", triple, error);
+  if (target == nullptr) {
+    module.emitError("Cannot initialize target triple : " + error);
+    return {};
+  }
+  std::string mcpu = getMcpuOptionString(config.amdgpuVersion);
+  std::string features = getCodeObjectOptionString(config.hsacoVersion);
+  std::unique_ptr<llvm::TargetMachine> targetMachine(
+      target->createTargetMachine(triple.str(), mcpu, features, {}, {}));
+  if (!targetMachine) {
+    module.emitError("Cannot create target machine for AMDGPU target");
+    return {};
+  }
+
+  // Set the data layout of the llvm module to match what the target needs.
+  llvmModule.setDataLayout(targetMachine->createDataLayout());
+  if (failed(linkROCDLIfNecessary(llvmModule, module, config.amdgpuVersion,
+                                  config.rocdlDir)))
+    return {};
+  // Lower LLVM module to HSA code object
+  return emitModuleToHSACO(llvmModule, module, *targetMachine);
+}
+
+StringAttr GpuKernelToHSACOPass::translateGpuModuleToHSACOAnnotation(
+    llvm::Module &llvmModule, ModuleOp module) {
+
+  OwnedHSACO hsaco = convertModuleToHSACO(llvmModule, module);
+  if (!hsaco)
+    return {};
+
+  return StringAttr::get({hsaco->data(), hsaco->size()}, module.getContext());
+}
+
+std::unique_ptr<OpPassBase<ModuleOp>> mlir::createConvertGPUKernelToHSACOPass(
+    rocm::HSACOGeneratorConfig hsacoGeneratorConfig) {
+  return std::make_unique<GpuKernelToHSACOPass>(hsacoGeneratorConfig);
+}
+
+static PassRegistration<GpuKernelToHSACOPass>
+    pass("test-kernel-to-hsaco",
+         "Convert all kernel functions to HSA code object blobs");
diff --git a/test/Conversion/GPUToROCM/lit.local.cfg b/test/Conversion/GPUToROCM/lit.local.cfg
new file mode 100644
index 000000000000..1d5aaaf6c673
--- /dev/null
+++ b/test/Conversion/GPUToROCM/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.run_rocm_tests:
+  config.unsupported = True
\ No newline at end of file
diff --git a/test/Conversion/GPUToROCM/lower-amdgpu-kernel-to-hsaco.mlir b/test/Conversion/GPUToROCM/lower-amdgpu-kernel-to-hsaco.mlir
new file mode 100644
index 000000000000..e43a44daf9c5
--- /dev/null
+++ b/test/Conversion/GPUToROCM/lower-amdgpu-kernel-to-hsaco.mlir
@@ -0,0 +1,31 @@
+// RUN: mlir-opt %s --test-kernel-to-hsaco -split-input-file | FileCheck %s
+
+// CHECK: attributes  {amdgpu.hsaco = "HSACO", gpu.kernel_module}
+module @gpu_kernels attributes  { gpu.kernel_module } {
+  // CHECK-LABEL: @kernel_A
+  llvm.func @kernel_A(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
+    // CHECK: attributes  {gpu.kernel}
+    attributes  { gpu.kernel } {
+    llvm.return
+  }
+}
+
+// -----
+
+// CHECK: attributes  {amdgpu.hsaco = "HSACO", gpu.kernel_module}
+module @gpu_kernels attributes  { gpu.kernel_module } {
+  // CHECK-LABEL: @kernel_A
+  llvm.func @kernel_A(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
+    // CHECK: attributes  {gpu.kernel}
+    attributes  { gpu.kernel } {
+    llvm.return
+  }
+  
+  // CHECK-LABEL: @kernel_B
+  llvm.func @kernel_B(%arg0 : !llvm.float, %arg1 : !llvm<"float*">)
+    // CHECK: attributes  {gpu.kernel}
+    attributes  { gpu.kernel } {
+    llvm.return
+  }
+}
+
diff --git a/test/lit.site.cfg.py.in b/test/lit.site.cfg.py.in
index aab566173e6c..b038822b6e64 100644
--- a/test/lit.site.cfg.py.in
+++ b/test/lit.site.cfg.py.in
@@ -36,6 +36,7 @@ config.build_examples = @LLVM_BUILD_EXAMPLES@
 config.run_cuda_tests = @MLIR_CUDA_CONVERSIONS_ENABLED@
 config.cuda_wrapper_library_dir = "@MLIR_CUDA_WRAPPER_LIBRARY_DIR@"
 config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@
+config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@
 
 # Support substitution of the tools_dir with user parameters. This is
 # used when we can't determine the tool dir at configuration time.
diff --git a/tools/mlir-opt/CMakeLists.txt b/tools/mlir-opt/CMakeLists.txt
index a279b0f4afcf..1965f8740032 100644
--- a/tools/mlir-opt/CMakeLists.txt
+++ b/tools/mlir-opt/CMakeLists.txt
@@ -55,6 +55,11 @@ if(MLIR_CUDA_CONVERSIONS_ENABLED)
     MLIRGPUtoCUDATransforms
   )
 endif()
+if(MLIR_ROCM_CONVERSIONS_ENABLED)
+  list(APPEND LIBS
+    MLIRGPUtoROCMTransforms
+  )
+endif()
 add_llvm_executable(mlir-opt
  mlir-opt.cpp
 )