From 080ea067ed33798ddee4d78d192c20d0e6077dea Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Fri, 10 Feb 2023 16:22:34 +0000 Subject: [PATCH 01/23] Adding Bazel compilation tools --- WORKSPACE | 45 + build_deps/build_pip_pkg.sh | 103 ++ build_deps/tf_dependency/BUILD | 0 build_deps/tf_dependency/BUILD.tpl | 18 + build_deps/tf_dependency/build_defs.bzl.tpl | 4 + build_deps/tf_dependency/tf_configure.bzl | 244 +++ build_deps/toolchains/gpu/BUILD | 0 build_deps/toolchains/gpu/crosstool/BUILD | 0 build_deps/toolchains/gpu/crosstool/BUILD.tpl | 69 + .../toolchains/gpu/crosstool/CROSSTOOL.tpl | 1409 ++++++++++++++++ .../gpu/crosstool/cc_toolchain_config.bzl.tpl | 1493 +++++++++++++++++ .../crosstool_wrapper_driver_is_not_gcc.tpl | 269 +++ .../windows/msvc_wrapper_for_nvcc.py.tpl | 192 +++ build_deps/toolchains/gpu/cub.BUILD | 25 + build_deps/toolchains/gpu/cuda/BUILD | 0 build_deps/toolchains/gpu/cuda/BUILD.tpl | 227 +++ .../toolchains/gpu/cuda/BUILD.windows.tpl | 164 ++ .../toolchains/gpu/cuda/build_defs.bzl.tpl | 62 + .../toolchains/gpu/cuda/cuda_config.h.tpl | 26 + build_deps/toolchains/gpu/cuda_configure.bzl | 1116 ++++++++++++ build_deps/toolchains/gpu/find_cuda_config.py | 682 ++++++++ configure.py | 199 +++ tensorflow_nufft/BUILD | 37 + tensorflow_nufft/tensorflow_nufft.bzl | 68 + tools/build/make_wheel_Linux_x86.sh | 17 + tools/build/make_wheel_Windows_x86.sh | 26 + tools/build/make_wheel_macOS_arm64.sh | 33 + tools/build/make_wheel_macOS_x86.sh | 32 + tools/docker/build_wheel.Dockerfile | 74 + tools/install_deps/black.txt | 1 + tools/install_deps/buildifier.sh | 18 + tools/install_deps/clang-format.sh | 20 + tools/install_deps/doc_requirements.txt | 2 + tools/install_deps/flake8.txt | 2 + tools/install_deps/install_bazelisk.sh | 23 + tools/install_deps/pytest.txt | 7 + tools/install_deps/tensorflow-cpu.txt | 1 + tools/install_deps/tensorflow.txt | 1 + tools/install_deps/typedapi.txt | 1 + tools/install_so_files.sh | 8 + 
tools/testing/build_and_run_tests.sh | 38 + tools/testing/parallel_gpu_execute.sh | 83 + tools/testing/source_code_test.py | 241 +++ 43 files changed, 7080 insertions(+) create mode 100644 WORKSPACE create mode 100755 build_deps/build_pip_pkg.sh create mode 100644 build_deps/tf_dependency/BUILD create mode 100644 build_deps/tf_dependency/BUILD.tpl create mode 100644 build_deps/tf_dependency/build_defs.bzl.tpl create mode 100644 build_deps/tf_dependency/tf_configure.bzl create mode 100644 build_deps/toolchains/gpu/BUILD create mode 100644 build_deps/toolchains/gpu/crosstool/BUILD create mode 100644 build_deps/toolchains/gpu/crosstool/BUILD.tpl create mode 100644 build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl create mode 100755 build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl create mode 100644 build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl create mode 100644 build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl create mode 100644 build_deps/toolchains/gpu/cub.BUILD create mode 100644 build_deps/toolchains/gpu/cuda/BUILD create mode 100644 build_deps/toolchains/gpu/cuda/BUILD.tpl create mode 100644 build_deps/toolchains/gpu/cuda/BUILD.windows.tpl create mode 100644 build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl create mode 100644 build_deps/toolchains/gpu/cuda/cuda_config.h.tpl create mode 100644 build_deps/toolchains/gpu/cuda_configure.bzl create mode 100644 build_deps/toolchains/gpu/find_cuda_config.py create mode 100644 configure.py create mode 100644 tensorflow_nufft/BUILD create mode 100644 tensorflow_nufft/tensorflow_nufft.bzl create mode 100644 tools/build/make_wheel_Linux_x86.sh create mode 100644 tools/build/make_wheel_Windows_x86.sh create mode 100644 tools/build/make_wheel_macOS_arm64.sh create mode 100644 tools/build/make_wheel_macOS_x86.sh create mode 100644 tools/docker/build_wheel.Dockerfile create mode 100644 tools/install_deps/black.txt create mode 100644 
tools/install_deps/buildifier.sh create mode 100644 tools/install_deps/clang-format.sh create mode 100644 tools/install_deps/doc_requirements.txt create mode 100644 tools/install_deps/flake8.txt create mode 100644 tools/install_deps/install_bazelisk.sh create mode 100644 tools/install_deps/pytest.txt create mode 100644 tools/install_deps/tensorflow-cpu.txt create mode 100644 tools/install_deps/tensorflow.txt create mode 100644 tools/install_deps/typedapi.txt create mode 100644 tools/install_so_files.sh create mode 100644 tools/testing/build_and_run_tests.sh create mode 100644 tools/testing/parallel_gpu_execute.sh create mode 100644 tools/testing/source_code_test.py diff --git a/WORKSPACE b/WORKSPACE new file mode 100644 index 0000000..03def49 --- /dev/null +++ b/WORKSPACE @@ -0,0 +1,45 @@ +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("//build_deps/tf_dependency:tf_configure.bzl", "tf_configure") +load("//build_deps/toolchains/gpu:cuda_configure.bzl", "cuda_configure") + +http_archive( + name = "cub_archive", + build_file = "//build_deps/toolchains/gpu:cub.BUILD", + sha256 = "6bfa06ab52a650ae7ee6963143a0bbc667d6504822cbd9670369b598f18c58c3", + strip_prefix = "cub-1.8.0", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/NVlabs/cub/archive/1.8.0.zip", + "https://github.com/NVlabs/cub/archive/1.8.0.zip", + ], +) + +tf_configure( + name = "local_config_tf", +) + +cuda_configure(name = "local_config_cuda") + +http_archive( + name = "org_tensorflow", + sha256 = "99c732b92b1b37fc243a559e02f9aef5671771e272758aa4aec7f34dc92dac48", + strip_prefix = "tensorflow-2.11.0", + urls = [ + "https://github.com/tensorflow/tensorflow/archive/refs/tags/v2.11.0.tar.gz", + ], +) + +load("@org_tensorflow//tensorflow:workspace3.bzl", "tf_workspace3") + +tf_workspace3() + +load("@org_tensorflow//tensorflow:workspace2.bzl", "tf_workspace2") + +tf_workspace2() + +load("@org_tensorflow//tensorflow:workspace1.bzl", "tf_workspace1") + 
+tf_workspace1() + +load("@org_tensorflow//tensorflow:workspace0.bzl", "tf_workspace0") + +tf_workspace0() diff --git a/build_deps/build_pip_pkg.sh b/build_deps/build_pip_pkg.sh new file mode 100755 index 0000000..cca5356 --- /dev/null +++ b/build_deps/build_pip_pkg.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +set -e +set -x + +PLATFORM="$(uname -s | tr 'A-Z' 'a-z')" + +function is_windows() { + # On windows, the shell script is actually running in msys + [[ "${PLATFORM}" =~ msys_nt*|mingw*|cygwin*|uwin* ]] +} + +function is_macos() { + [[ "${PLATFORM}" == "darwin" ]] +} + +if is_windows; then + PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.exe.runfiles/__main__/" +else + PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.runfiles/__main__/" +fi + +function abspath() { + cd "$(dirname $1)" + echo "$PWD/$(basename $1)" + cd "$OLDPWD" +} + +function main() { + DEST=${1} + NIGHTLY_FLAG=${2} + + if [[ -z ${DEST} ]]; then + echo "No destination dir provided" + exit 1 + fi + + mkdir -p ${DEST} + DEST=$(abspath "${DEST}") + echo "=== destination directory: ${DEST}" + + TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX) + echo $(date) : "=== Using tmpdir: ${TMPDIR}" + echo "=== Copy TensorFlow NUFFT files" + + cp ${PIP_FILE_PREFIX}setup.py "${TMPDIR}" + cp ${PIP_FILE_PREFIX}MANIFEST.in "${TMPDIR}" + cp 
${PIP_FILE_PREFIX}LICENSE "${TMPDIR}" + cp ${PIP_FILE_PREFIX}requirements.txt "${TMPDIR}" + touch ${TMPDIR}/stub.cc + + if is_windows; then + from=$(cygpath -w ${PIP_FILE_PREFIX}tensorflow_nufft) + to=$(cygpath -w "${TMPDIR}"/tensorflow_nufft) + start robocopy //S "${from}" "${to}" //xf *_test.py + sleep 5 + else + rsync -avm -L --exclude='*_test.py' ${PIP_FILE_PREFIX}tensorflow_nufft "${TMPDIR}" + fi + + pushd ${TMPDIR} + echo $(date) : "=== Building wheel" + + + BUILD_CMD="setup.py bdist_wheel --platlib-patch" + if is_macos; then + if [[ x"$(arch)" == x"arm64" ]]; then + BUILD_CMD="${BUILD_CMD} --plat-name macosx_12_0_arm64" + else + BUILD_CMD="${BUILD_CMD} --plat-name macosx_10_14_x86_64" + fi + PYTHON=python3 + else + PYTHON=python + fi + + if [[ -z ${NIGHTLY_FLAG} ]]; then + # Windows has issues with locking library files for deletion so do not fail here + $PYTHON ${BUILD_CMD} || true + else + $PYTHON ${BUILD_CMD} ${NIGHTLY_FLAG} || true + fi + + cp dist/*.whl "${DEST}" + popd + rm -rf ${TMPDIR} + echo $(date) : "=== Output wheel file is in: ${DEST}" +} + +main "$@" diff --git a/build_deps/tf_dependency/BUILD b/build_deps/tf_dependency/BUILD new file mode 100644 index 0000000..e69de29 diff --git a/build_deps/tf_dependency/BUILD.tpl b/build_deps/tf_dependency/BUILD.tpl new file mode 100644 index 0000000..047baee --- /dev/null +++ b/build_deps/tf_dependency/BUILD.tpl @@ -0,0 +1,18 @@ +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "tf_header_lib", + hdrs = [":tf_header_include"], + includes = ["include"], + visibility = ["//visibility:public"], +) + + +cc_library( + name = "libtensorflow_framework", + srcs = ["%{TF_SHARED_LIBRARY_NAME}"], + visibility = ["//visibility:public"], +) + +%{TF_HEADER_GENRULE} +%{TF_SHARED_LIBRARY_GENRULE} \ No newline at end of file diff --git a/build_deps/tf_dependency/build_defs.bzl.tpl b/build_deps/tf_dependency/build_defs.bzl.tpl new file mode 100644 index 0000000..4ae1298 --- /dev/null +++ 
b/build_deps/tf_dependency/build_defs.bzl.tpl @@ -0,0 +1,4 @@ +# NUFFT Build Definitions inherited from TensorFlow Core + +D_GLIBCXX_USE_CXX11_ABI = "%{tf_cx11_abi}" +CPLUSPLUS_VERSION = "%{tf_cplusplus_ver}" diff --git a/build_deps/tf_dependency/tf_configure.bzl b/build_deps/tf_dependency/tf_configure.bzl new file mode 100644 index 0000000..0c0b5e7 --- /dev/null +++ b/build_deps/tf_dependency/tf_configure.bzl @@ -0,0 +1,244 @@ +"""Setup TensorFlow as external dependency""" + +_TF_HEADER_DIR = "TF_HEADER_DIR" + +_TF_SHARED_LIBRARY_DIR = "TF_SHARED_LIBRARY_DIR" + +_TF_SHARED_LIBRARY_NAME = "TF_SHARED_LIBRARY_NAME" + +_TF_CXX11_ABI_FLAG = "TF_CXX11_ABI_FLAG" + +_TF_CPLUSPLUS_VER = "TF_CPLUSPLUS_VER" + +def _tpl(repository_ctx, tpl, substitutions = {}, out = None): + if not out: + out = tpl + repository_ctx.template( + out, + Label("//build_deps/tf_dependency:%s.tpl" % tpl), + substitutions, + ) + +def _fail(msg): + """Output failure message when auto configuration fails.""" + red = "\033[0;31m" + no_color = "\033[0m" + fail("%sPython Configuration Error:%s %s\n" % (red, no_color, msg)) + +def _is_windows(repository_ctx): + """Returns true if the host operating system is windows.""" + os_name = repository_ctx.os.name.lower() + if os_name.find("windows") != -1: + return True + return False + +def _execute( + repository_ctx, + cmdline, + error_msg = None, + error_details = None, + empty_stdout_fine = False): + """Executes an arbitrary shell command. + + Helper for executes an arbitrary shell command. + + Args: + repository_ctx: the repository_ctx object. + cmdline: list of strings, the command to execute. + error_msg: string, a summary of the error if the command fails. + error_details: string, details about the error or steps to fix it. + empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise + it's an error. + + Returns: + The result of repository_ctx.execute(cmdline). 
+ """ + result = repository_ctx.execute(cmdline) + if result.stderr or not (empty_stdout_fine or result.stdout): + _fail("\n".join([ + error_msg.strip() if error_msg else "Repository command failed", + result.stderr.strip(), + error_details if error_details else "", + ])) + return result + +def _read_dir(repository_ctx, src_dir): + """Returns a string with all files in a directory. + + Finds all files inside a directory, traversing subfolders and following + symlinks. The returned string contains the full path of all files + separated by line breaks. + + Args: + repository_ctx: the repository_ctx object. + src_dir: directory to find files from. + + Returns: + A string of all files inside the given dir. + """ + if _is_windows(repository_ctx): + src_dir = src_dir.replace("/", "\\") + find_result = _execute( + repository_ctx, + ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"], + empty_stdout_fine = True, + ) + + # src_files will be used in genrule.outs where the paths must + # use forward slashes. + result = find_result.stdout.replace("\\", "/") + else: + find_result = _execute( + repository_ctx, + ["find", src_dir, "-follow", "-type", "f"], + empty_stdout_fine = True, + ) + result = find_result.stdout + return result + +def _genrule(genrule_name, command, outs): + """Returns a string with a genrule. + + Genrule executes the given command and produces the given outputs. + + Args: + genrule_name: A unique name for genrule target. + command: The command to run. + outs: A list of files generated by this rule. + + Returns: + A genrule target. 
+ """ + return ( + "genrule(\n" + + ' name = "' + + genrule_name + '",\n' + + " outs = [\n" + + outs + + "\n ],\n" + + ' cmd = """\n' + + command + + '\n """,\n' + + ")\n" + ) + +def _norm_path(path): + """Returns a path with '/' and remove the trailing slash.""" + path = path.replace("\\", "/") + if path[-1] == "/": + path = path[:-1] + return path + +def _symlink_genrule_for_dir( + repository_ctx, + src_dir, + dest_dir, + genrule_name, + src_files = [], + dest_files = [], + tf_pip_dir_rename_pair = []): + """Returns a genrule to symlink(or copy if on Windows) a set of files. + If src_dir is passed, files will be read from the given directory; otherwise + we assume files are in src_files and dest_files. + Args: + repository_ctx: the repository_ctx object. + src_dir: source directory. + dest_dir: directory to create symlink in. + genrule_name: genrule name. + src_files: list of source files instead of src_dir. + dest_files: list of corresonding destination files. + tf_pip_dir_rename_pair: list of the pair of tf pip parent directory to + replace. For example, in TF pip package, the source code is under + "tensorflow_core", and we might want to replace it with + "tensorflow" to match the header includes. + Returns: + genrule target that creates the symlinks. + """ + + # Check that tf_pip_dir_rename_pair has the right length + tf_pip_dir_rename_pair_len = len(tf_pip_dir_rename_pair) + if tf_pip_dir_rename_pair_len != 0 and tf_pip_dir_rename_pair_len != 2: + _fail("The size of argument tf_pip_dir_rename_pair should be either 0 or 2, but %d is given." % tf_pip_dir_rename_pair_len) + + if src_dir != None: + src_dir = _norm_path(src_dir) + dest_dir = _norm_path(dest_dir) + files = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines())) + + # Create a list with the src_dir stripped to use for outputs. 
+ if tf_pip_dir_rename_pair_len: + dest_files = files.replace(src_dir, "").replace(tf_pip_dir_rename_pair[0], tf_pip_dir_rename_pair[1]).splitlines() + else: + dest_files = files.replace(src_dir, "").splitlines() + src_files = files.splitlines() + command = [] + outs = [] + + for i in range(len(dest_files)): + if dest_files[i] != "": + # If we have only one file to link we do not want to use the dest_dir, as + # $(@D) will include the full path to the file. + dest = "$(@D)/" + dest_dir + dest_files[i] if len(dest_files) != 1 else "$(@D)/" + dest_files[i] + + # Copy the headers to create a sandboxable setup. + cmd = "cp -f" + command.append(cmd + ' "%s" "%s"' % (src_files[i], dest)) + outs.append(' "' + dest_dir + dest_files[i] + '",') + + genrule = _genrule( + genrule_name, + ";\n".join(command), + "\n".join(outs), + ) + return genrule + +def _tf_pip_impl(repository_ctx): + tf_header_dir = repository_ctx.os.environ[_TF_HEADER_DIR] + tf_header_rule = _symlink_genrule_for_dir( + repository_ctx, + tf_header_dir, + "include", + "tf_header_include", + tf_pip_dir_rename_pair = ["tensorflow_core", "tensorflow"], + ) + + tf_shared_library_dir = repository_ctx.os.environ[_TF_SHARED_LIBRARY_DIR] + tf_shared_library_name = repository_ctx.os.environ[_TF_SHARED_LIBRARY_NAME] + tf_shared_library_path = "%s/%s" % (tf_shared_library_dir, tf_shared_library_name) + tf_cx11_abi = "-D_GLIBCXX_USE_CXX11_ABI=%s" % (repository_ctx.os.environ[_TF_CXX11_ABI_FLAG]) + tf_cplusplus_ver = "-std=%s" % repository_ctx.os.environ[_TF_CPLUSPLUS_VER] + + tf_shared_library_rule = _symlink_genrule_for_dir( + repository_ctx, + None, + "", + tf_shared_library_name, + [tf_shared_library_path], + [tf_shared_library_name], + ) + + _tpl(repository_ctx, "BUILD", { + "%{TF_HEADER_GENRULE}": tf_header_rule, + "%{TF_SHARED_LIBRARY_GENRULE}": tf_shared_library_rule, + "%{TF_SHARED_LIBRARY_NAME}": tf_shared_library_name, + }) + + _tpl( + repository_ctx, + "build_defs.bzl", + { + "%{tf_cx11_abi}": tf_cx11_abi, + 
"%{tf_cplusplus_ver}": tf_cplusplus_ver, + }, + ) + +tf_configure = repository_rule( + environ = [ + _TF_HEADER_DIR, + _TF_SHARED_LIBRARY_DIR, + _TF_SHARED_LIBRARY_NAME, + _TF_CXX11_ABI_FLAG, + _TF_CPLUSPLUS_VER, + ], + implementation = _tf_pip_impl, +) diff --git a/build_deps/toolchains/gpu/BUILD b/build_deps/toolchains/gpu/BUILD new file mode 100644 index 0000000..e69de29 diff --git a/build_deps/toolchains/gpu/crosstool/BUILD b/build_deps/toolchains/gpu/crosstool/BUILD new file mode 100644 index 0000000..e69de29 diff --git a/build_deps/toolchains/gpu/crosstool/BUILD.tpl b/build_deps/toolchains/gpu/crosstool/BUILD.tpl new file mode 100644 index 0000000..5e21c2c --- /dev/null +++ b/build_deps/toolchains/gpu/crosstool/BUILD.tpl @@ -0,0 +1,69 @@ +licenses(["restricted"]) + +package(default_visibility = ["//visibility:public"]) + +load(":cc_toolchain_config.bzl", "cc_toolchain_config") + + +toolchain( + name = "toolchain-linux-x86_64", + exec_compatible_with = [ + "@bazel_tools//platforms:linux", + "@bazel_tools//platforms:x86_64", + ], + target_compatible_with = [ + "@bazel_tools//platforms:linux", + "@bazel_tools//platforms:x86_64", + ], + toolchain = ":cc-compiler-local", + toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", +) + +cc_toolchain_suite( + name = "toolchain", + toolchains = { + "local|compiler": ":cc-compiler-local", + "k8": ":cc-compiler-local", + "ppc": ":cc-compiler-local", + "aarch64": ":cc-compiler-local", + }, +) + +cc_toolchain( + name = "cc-compiler-local", + all_files = "%{linker_files}", + compiler_files = ":empty", + dwp_files = ":empty", + linker_files = "%{linker_files}", + objcopy_files = ":empty", + strip_files = ":empty", + # To support linker flags that need to go to the start of command line + # we need the toolchain to support parameter files. Parameter files are + # last on the command line and contain all shared libraries to link, so all + # regular options will be left of them. 
+ supports_param_files = 1, + toolchain_config = ":cc-compiler-local-config", + toolchain_identifier = "local_linux", +) + +cc_toolchain_config( + name = "cc-compiler-local-config", + cpu = "local", + builtin_include_directories = "%{cxx_builtin_include_directories}".split(","), + extra_no_canonical_prefixes_flags = ["-fno-canonical-system-headers"], + host_compiler_path = "clang/bin/crosstool_wrapper_driver_is_not_gcc", + host_compiler_prefix = "/usr/bin", + host_compiler_warnings = [], + host_unfiltered_compile_flags = [], + linker_bin_path = "/usr/bin", +) + +filegroup( + name = "empty", + srcs = [], +) + +filegroup( + name = "crosstool_wrapper_driver_is_not_gcc", + srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"], +) diff --git a/build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl b/build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl new file mode 100644 index 0000000..1a13ac8 --- /dev/null +++ b/build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl @@ -0,0 +1,1409 @@ +major_version: "local" +minor_version: "" +default_target_cpu: "same_as_host" + +toolchain { + abi_version: "local" + abi_libc_version: "local" + compiler: "compiler" + host_system_name: "local" + needsPic: true + target_libc: "local" + target_cpu: "local" + target_system_name: "local" + toolchain_identifier: "local_linux" + + feature { + name: "c++11" + flag_set { + action: "c++-compile" + flag_group { + flag: "-std=c++11" + } + } + } + + feature { + name: "stdlib" + flag_set { + action: "c++-link-executable" + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "-lstdc++" + } + } + } + + feature { + name: "determinism" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # Make C++ compilation deterministic. Use linkstamping instead of these + # compiler symbols. 
+ flag: "-Wno-builtin-macro-redefined" + flag: "-D__DATE__=\"redacted\"" + flag: "-D__TIMESTAMP__=\"redacted\"" + flag: "-D__TIME__=\"redacted\"" + } + } + } + + feature { + name: "alwayslink" + flag_set { + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + action: "c++-link-executable" + flag_group { + flag: "-Wl,-no-as-needed" + } + } + } + + # This feature will be enabled for builds that support pic by bazel. + feature { + name: "pic" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + expand_if_all_available: "pic" + flag: "-fPIC" + } + flag_group { + expand_if_none_available: "pic" + flag: "-fPIE" + } + } + } + + # Security hardening on by default. + feature { + name: "hardening" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases. + # We need to undef it before redefining it as some distributions now + # have it enabled by default. + flag: "-U_FORTIFY_SOURCE" + flag: "-D_FORTIFY_SOURCE=1" + flag: "-fstack-protector" + } + } + flag_set { + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "-Wl,-z,relro,-z,now" + } + } + flag_set { + action: "c++-link-executable" + flag_group { + flag: "-pie" + flag: "-Wl,-z,relro,-z,now" + } + } + } + + feature { + name: "warnings" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # All warnings are enabled. Maybe enable -Werror as well? + flag: "-Wall" + %{host_compiler_warnings} + } + } + } + + # Keep stack frames for debugging, even in opt mode. 
+ feature { + name: "frame-pointer" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + flag: "-fno-omit-frame-pointer" + } + } + } + + feature { + name: "build-id" + flag_set { + action: "c++-link-executable" + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + flag_group { + # Stamp the binary with a unique identifier. + flag: "-Wl,--build-id=md5" + flag: "-Wl,--hash-style=gnu" + } + } + } + + feature { + name: "no-canonical-prefixes" + flag_set { + action: "c-compile" + action: "c++-compile" + action: "c++-link-executable" + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "-no-canonical-prefixes" + %{extra_no_canonical_prefixes_flags} + } + } + } + + feature { + name: "disable-assertions" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + flag: "-DNDEBUG" + } + } + } + + feature { + name: "linker-bin-path" + + flag_set { + action: "c++-link-executable" + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + flag_group { + %{linker_bin_path_flag} + } + } + } + + feature { + name: "common" + implies: "stdlib" + implies: "c++11" + implies: "determinism" + implies: "alwayslink" + implies: "hardening" + implies: "warnings" + implies: "frame-pointer" + implies: "build-id" + implies: "no-canonical-prefixes" + implies: "linker-bin-path" + } + + feature { + name: "opt" + implies: "common" + implies: "disable-assertions" + + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # No debug symbols. + # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt + # or even generally? However, that can't happen here, as it requires + # special handling in Bazel. + flag: "-g0" + + # Conservative choice for -O + # -O3 can increase binary size and even slow down the resulting binaries. + # Profile first and / or use FDO if you need better performance than this. 
+ flag: "-O2" + + # Removal of unused code and data at link time (can this increase binary size in some cases?). + flag: "-ffunction-sections" + flag: "-fdata-sections" + } + } + flag_set { + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + action: "c++-link-executable" + flag_group { + flag: "-Wl,--gc-sections" + } + } + } + + feature { + name: "fastbuild" + implies: "common" + } + + feature { + name: "dbg" + implies: "common" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + flag: "-g" + } + } + } + + # Set clang as a C/C++ compiler. + tool_path { name: "gcc" path: "%{host_compiler_path}" } + + # Use the default system toolchain for everything else. + tool_path { name: "ar" path: "/usr/bin/ar" } + tool_path { name: "compat-ld" path: "/usr/bin/ld" } + tool_path { name: "cpp" path: "/usr/bin/cpp" } + tool_path { name: "dwp" path: "/usr/bin/dwp" } + tool_path { name: "gcov" path: "/usr/bin/gcov" } + tool_path { name: "ld" path: "/usr/bin/ld" } + tool_path { name: "nm" path: "/usr/bin/nm" } + tool_path { name: "objcopy" path: "/usr/bin/objcopy" } + tool_path { name: "objdump" path: "/usr/bin/objdump" } + tool_path { name: "strip" path: "/usr/bin/strip" } + + # Enabled dynamic linking. 
+ linking_mode_flags { mode: DYNAMIC } + +%{host_compiler_includes} +} + +toolchain { + abi_version: "local" + abi_libc_version: "local" + compiler: "compiler" + host_system_name: "local" + needsPic: true + target_libc: "macosx" + target_cpu: "darwin" + target_system_name: "local" + toolchain_identifier: "local_darwin" + feature { + name: "c++11" + flag_set { + action: "c++-compile" + flag_group { + flag: "-std=c++11" + } + } + } + + feature { + name: "stdlib" + flag_set { + action: "c++-link-executable" + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "-lc++" + } + } + } + + feature { + name: "determinism" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # Make C++ compilation deterministic. Use linkstamping instead of these + # compiler symbols. + flag: "-Wno-builtin-macro-redefined" + flag: "-D__DATE__=\"redacted\"" + flag: "-D__TIMESTAMP__=\"redacted\"" + flag: "-D__TIME__=\"redacted\"" + } + } + } + + # This feature will be enabled for builds that support pic by bazel. + feature { + name: "pic" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + expand_if_all_available: "pic" + flag: "-fPIC" + } + flag_group { + expand_if_none_available: "pic" + flag: "-fPIE" + } + } + } + + # Security hardening on by default. + feature { + name: "hardening" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases. + # We need to undef it before redefining it as some distributions now + # have it enabled by default. + flag: "-U_FORTIFY_SOURCE" + flag: "-D_FORTIFY_SOURCE=1" + flag: "-fstack-protector" + } + } + flag_set { + action: "c++-link-executable" + flag_group { + flag: "-pie" + } + } + } + + feature { + name: "warnings" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # All warnings are enabled. Maybe enable -Werror as well? 
+ flag: "-Wall" + %{host_compiler_warnings} + } + } + } + + # Keep stack frames for debugging, even in opt mode. + feature { + name: "frame-pointer" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + flag: "-fno-omit-frame-pointer" + } + } + } + + feature { + name: "no-canonical-prefixes" + flag_set { + action: "c-compile" + action: "c++-compile" + action: "c++-link-executable" + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag:"-no-canonical-prefixes" + } + } + } + + feature { + name: "disable-assertions" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + flag: "-DNDEBUG" + } + } + } + + feature { + name: "linker-bin-path" + + flag_set { + action: "c++-link-executable" + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + flag_group { + %{linker_bin_path_flag} + } + } + } + + feature { + name: "undefined-dynamic" + flag_set { + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + action: "c++-link-executable" + flag_group { + flag: "-undefined" + flag: "dynamic_lookup" + } + } + } + + feature { + name: "common" + implies: "stdlib" + implies: "c++11" + implies: "determinism" + implies: "hardening" + implies: "warnings" + implies: "frame-pointer" + implies: "no-canonical-prefixes" + implies: "linker-bin-path" + implies: "undefined-dynamic" + } + + feature { + name: "opt" + implies: "common" + implies: "disable-assertions" + + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + # No debug symbols. + # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt + # or even generally? However, that can't happen here, as it requires + # special handling in Bazel. + flag: "-g0" + + # Conservative choice for -O + # -O3 can increase binary size and even slow down the resulting binaries. + # Profile first and / or use FDO if you need better performance than this. 
+ flag: "-O2" + + # Removal of unused code and data at link time (can this increase binary size in some cases?). + flag: "-ffunction-sections" + flag: "-fdata-sections" + } + } + } + + feature { + name: "fastbuild" + implies: "common" + } + + feature { + name: "dbg" + implies: "common" + flag_set { + action: "c-compile" + action: "c++-compile" + flag_group { + flag: "-g" + } + } + } + + # Set clang as a C/C++ compiler. + tool_path { name: "gcc" path: "%{host_compiler_path}" } + + # Use the default system toolchain for everything else. + tool_path { name: "ar" path: "/usr/bin/libtool" } + tool_path { name: "compat-ld" path: "/usr/bin/ld" } + tool_path { name: "cpp" path: "/usr/bin/cpp" } + tool_path { name: "dwp" path: "/usr/bin/dwp" } + tool_path { name: "gcov" path: "/usr/bin/gcov" } + tool_path { name: "ld" path: "/usr/bin/ld" } + tool_path { name: "nm" path: "/usr/bin/nm" } + tool_path { name: "objcopy" path: "/usr/bin/objcopy" } + tool_path { name: "objdump" path: "/usr/bin/objdump" } + tool_path { name: "strip" path: "/usr/bin/strip" } + + # Enabled dynamic linking. 
+ linking_mode_flags { mode: DYNAMIC } + +%{host_compiler_includes} +} + +toolchain { + toolchain_identifier: "local_windows" + host_system_name: "local" + target_system_name: "local" + + abi_version: "local" + abi_libc_version: "local" + target_cpu: "x64_windows" + compiler: "msvc-cl" + target_libc: "msvcrt" + +%{cxx_builtin_include_directory} + + tool_path { + name: "ar" + path: "%{msvc_lib_path}" + } + tool_path { + name: "ml" + path: "%{msvc_ml_path}" + } + tool_path { + name: "cpp" + path: "%{msvc_cl_path}" + } + tool_path { + name: "gcc" + path: "%{msvc_cl_path}" + } + tool_path { + name: "gcov" + path: "wrapper/bin/msvc_nop.bat" + } + tool_path { + name: "ld" + path: "%{msvc_link_path}" + } + tool_path { + name: "nm" + path: "wrapper/bin/msvc_nop.bat" + } + tool_path { + name: "objcopy" + path: "wrapper/bin/msvc_nop.bat" + } + tool_path { + name: "objdump" + path: "wrapper/bin/msvc_nop.bat" + } + tool_path { + name: "strip" + path: "wrapper/bin/msvc_nop.bat" + } + supports_interface_shared_objects: true + + # TODO(pcloudy): Review those flags below, they should be defined by cl.exe + compiler_flag: "/DCOMPILER_MSVC" + + # Don't define min/max macros in windows.h. + compiler_flag: "/DNOMINMAX" + + # Platform defines. + compiler_flag: "/D_WIN32_WINNT=0x0600" + # Turn off warning messages. + compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE" + compiler_flag: "/D_CRT_SECURE_NO_WARNINGS" + compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS" + + # Useful options to have on for compilation. + # Increase the capacity of object files to 2^32 sections. + compiler_flag: "/bigobj" + # Allocate 500MB for precomputed headers. + compiler_flag: "/Zm500" + # Use unsigned char by default. + compiler_flag: "/J" + # Use function level linking. + compiler_flag: "/Gy" + # Use string pooling. + compiler_flag: "/GF" + # Catch C++ exceptions only and tell the compiler to assume that functions declared + # as extern "C" never throw a C++ exception. 
+ compiler_flag: "/EHsc"
+
+ # Globally disabled warnings.
+ # Don't warn about elements of array being default initialized.
+ compiler_flag: "/wd4351"
+ # Don't warn about no matching delete found.
+ compiler_flag: "/wd4291"
+ # Don't warn about diamond inheritance patterns.
+ compiler_flag: "/wd4250"
+ # Don't warn about insecure functions (e.g. non _s functions).
+ compiler_flag: "/wd4996"
+
+ linker_flag: "/MACHINE:X64"
+
+ feature {
+ name: "no_legacy_features"
+ }
+
+ # TODO(klimek): Previously we were using a .bat file to start python to run
+ # the python script that can redirect to nvcc - unfortunately .bat files
+ # have a rather short maximum length for command lines (8k). Instead, we
+ # now use the python binary as the compiler and pass the python script to
+ # it at the start of the command line. Investigate different possibilities
+ # to run the nvcc wrapper, either using pyinstaller --onefile, or writing
+ # a small C++ wrapper to redirect.
+ feature {
+ name: "redirector"
+ enabled: true
+ flag_set {
+ action: "c-compile"
+ action: "c++-compile"
+ action: "c++-module-compile"
+ action: "c++-module-codegen"
+ action: "c++-header-parsing"
+ action: "assemble"
+ action: "preprocess-assemble"
+ flag_group {
+ flag: "-B"
+ flag: "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py"
+ }
+ }
+ }
+
+ # Suppress startup banner.
+ feature {
+ name: "nologo"
+ flag_set {
+ action: "c-compile"
+ action: "c++-compile"
+ action: "c++-module-compile"
+ action: "c++-module-codegen"
+ action: "c++-header-parsing"
+ action: "assemble"
+ action: "preprocess-assemble"
+ action: "c++-link-executable"
+ action: "c++-link-dynamic-library"
+ action: "c++-link-nodeps-dynamic-library"
+ action: "c++-link-static-library"
+ flag_group {
+ flag: "/nologo"
+ }
+ }
+ }
+
+ feature {
+ name: 'has_configured_linker_path'
+ }
+
+ # This feature indicates strip is not supported; building a stripped binary will just result in a copy of the original binary
+ feature {
+ name: 'no_stripping'
+ }
+
+ # This feature indicates this is a toolchain targeting Windows.
+ feature {
+ name: 'targets_windows'
+ implies: 'copy_dynamic_libraries_to_binary'
+ enabled: true
+ }
+
+ feature {
+ name: 'copy_dynamic_libraries_to_binary'
+ }
+
+ action_config {
+ config_name: 'assemble'
+ action_name: 'assemble'
+ tool {
+ tool_path: '%{msvc_ml_path}'
+ }
+ implies: 'compiler_input_flags'
+ implies: 'compiler_output_flags'
+ implies: 'nologo'
+ implies: 'msvc_env'
+ implies: 'sysroot'
+ }
+
+ action_config {
+ config_name: 'preprocess-assemble'
+ action_name: 'preprocess-assemble'
+ tool {
+ tool_path: '%{msvc_ml_path}'
+ }
+ implies: 'compiler_input_flags'
+ implies: 'compiler_output_flags'
+ implies: 'nologo'
+ implies: 'msvc_env'
+ implies: 'sysroot'
+ }
+
+ action_config {
+ config_name: 'c-compile'
+ action_name: 'c-compile'
+ tool {
+ tool_path: '%{msvc_cl_path}'
+ }
+ implies: 'compiler_input_flags'
+ implies: 'compiler_output_flags'
+ implies: 'legacy_compile_flags'
+ implies: 'nologo'
+ implies: 'msvc_env'
+ implies: 'parse_showincludes'
+ implies: 'user_compile_flags'
+ implies: 'sysroot'
+ implies: 'unfiltered_compile_flags'
+ }
+
+ action_config {
+ config_name: 'c++-compile'
+ action_name: 'c++-compile'
+ tool {
+ tool_path: '%{msvc_cl_path}'
+ }
+ implies: 'compiler_input_flags'
+ implies: 'compiler_output_flags'
+ implies: 
'legacy_compile_flags' + implies: 'nologo' + implies: 'msvc_env' + implies: 'parse_showincludes' + implies: 'user_compile_flags' + implies: 'sysroot' + implies: 'unfiltered_compile_flags' + } + + action_config { + config_name: 'c++-link-executable' + action_name: 'c++-link-executable' + tool { + tool_path: '%{msvc_link_path}' + } + implies: 'nologo' + implies: 'linkstamps' + implies: 'output_execpath_flags' + implies: 'input_param_flags' + implies: 'user_link_flags' + implies: 'legacy_link_flags' + implies: 'linker_subsystem_flag' + implies: 'linker_param_file' + implies: 'msvc_env' + implies: 'no_stripping' + } + + action_config { + config_name: 'c++-link-dynamic-library' + action_name: 'c++-link-dynamic-library' + tool { + tool_path: '%{msvc_link_path}' + } + implies: 'nologo' + implies: 'shared_flag' + implies: 'linkstamps' + implies: 'output_execpath_flags' + implies: 'input_param_flags' + implies: 'user_link_flags' + implies: 'legacy_link_flags' + implies: 'linker_subsystem_flag' + implies: 'linker_param_file' + implies: 'msvc_env' + implies: 'no_stripping' + implies: 'has_configured_linker_path' + implies: 'def_file' + } + + action_config { + config_name: 'c++-link-nodeps-dynamic-library' + action_name: 'c++-link-nodeps-dynamic-library' + tool { + tool_path: '%{msvc_link_path}' + } + implies: 'nologo' + implies: 'shared_flag' + implies: 'linkstamps' + implies: 'output_execpath_flags' + implies: 'input_param_flags' + implies: 'user_link_flags' + implies: 'legacy_link_flags' + implies: 'linker_subsystem_flag' + implies: 'linker_param_file' + implies: 'msvc_env' + implies: 'no_stripping' + implies: 'has_configured_linker_path' + implies: 'def_file' + } + + action_config { + config_name: 'c++-link-static-library' + action_name: 'c++-link-static-library' + tool { + tool_path: '%{msvc_lib_path}' + } + implies: 'nologo' + implies: 'archiver_flags' + implies: 'input_param_flags' + implies: 'linker_param_file' + implies: 'msvc_env' + } + + # TODO(b/65151735): Remove 
legacy_compile_flags feature when legacy fields are + # not used in this crosstool + feature { + name: 'legacy_compile_flags' + flag_set { + expand_if_all_available: 'legacy_compile_flags' + action: 'preprocess-assemble' + action: 'c-compile' + action: 'c++-compile' + action: 'c++-header-parsing' + action: 'c++-module-compile' + action: 'c++-module-codegen' + flag_group { + iterate_over: 'legacy_compile_flags' + flag: '%{legacy_compile_flags}' + } + } + } + + feature { + name: "msvc_env" + env_set { + action: "c-compile" + action: "c++-compile" + action: "c++-module-compile" + action: "c++-module-codegen" + action: "c++-header-parsing" + action: "assemble" + action: "preprocess-assemble" + action: "c++-link-executable" + action: "c++-link-dynamic-library" + action: "c++-link-nodeps-dynamic-library" + action: "c++-link-static-library" + env_entry { + key: "PATH" + value: "%{msvc_env_path}" + } + env_entry { + key: "INCLUDE" + value: "%{msvc_env_include}" + } + env_entry { + key: "LIB" + value: "%{msvc_env_lib}" + } + env_entry { + key: "TMP" + value: "%{msvc_env_tmp}" + } + env_entry { + key: "TEMP" + value: "%{msvc_env_tmp}" + } + } + } + + feature { + name: 'include_paths' + flag_set { + action: "assemble" + action: 'preprocess-assemble' + action: 'c-compile' + action: 'c++-compile' + action: 'c++-header-parsing' + action: 'c++-module-compile' + flag_group { + iterate_over: 'quote_include_paths' + flag: '/I%{quote_include_paths}' + } + flag_group { + iterate_over: 'include_paths' + flag: '/I%{include_paths}' + } + flag_group { + iterate_over: 'system_include_paths' + flag: '/I%{system_include_paths}' + } + } + } + + feature { + name: "preprocessor_defines" + flag_set { + action: "assemble" + action: "preprocess-assemble" + action: "c-compile" + action: "c++-compile" + action: "c++-header-parsing" + action: "c++-module-compile" + flag_group { + flag: "/D%{preprocessor_defines}" + iterate_over: "preprocessor_defines" + } + } + } + + # Tell Bazel to parse the output 
of /showIncludes + feature { + name: 'parse_showincludes' + flag_set { + action: 'preprocess-assemble' + action: 'c-compile' + action: 'c++-compile' + action: 'c++-module-compile' + action: 'c++-header-parsing' + flag_group { + flag: "/showIncludes" + } + } + } + + + feature { + name: 'generate_pdb_file' + requires: { + feature: 'dbg' + } + requires: { + feature: 'fastbuild' + } + } + + feature { + name: 'shared_flag' + flag_set { + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: '/DLL' + } + } + } + + feature { + name: 'linkstamps' + flag_set { + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + expand_if_all_available: 'linkstamp_paths' + flag_group { + iterate_over: 'linkstamp_paths' + flag: '%{linkstamp_paths}' + } + } + } + + feature { + name: 'output_execpath_flags' + flag_set { + expand_if_all_available: 'output_execpath' + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: '/OUT:%{output_execpath}' + } + } + } + + feature { + name: 'archiver_flags' + flag_set { + expand_if_all_available: 'output_execpath' + action: 'c++-link-static-library' + flag_group { + flag: '/OUT:%{output_execpath}' + } + } + } + + feature { + name: 'input_param_flags' + flag_set { + expand_if_all_available: 'interface_library_output_path' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "/IMPLIB:%{interface_library_output_path}" + } + } + flag_set { + expand_if_all_available: 'libopts' + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + iterate_over: 'libopts' + flag: '%{libopts}' + } + } + flag_set { + expand_if_all_available: 'libraries_to_link' + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: 
"c++-link-nodeps-dynamic-library" + action: 'c++-link-static-library' + flag_group { + iterate_over: 'libraries_to_link' + flag_group { + expand_if_equal: { + variable: 'libraries_to_link.type' + value: 'object_file_group' + } + iterate_over: 'libraries_to_link.object_files' + flag_group { + flag: '%{libraries_to_link.object_files}' + } + } + flag_group { + expand_if_equal: { + variable: 'libraries_to_link.type' + value: 'object_file' + } + flag_group { + flag: '%{libraries_to_link.name}' + } + } + flag_group { + expand_if_equal: { + variable: 'libraries_to_link.type' + value: 'interface_library' + } + flag_group { + flag: '%{libraries_to_link.name}' + } + } + flag_group { + expand_if_equal: { + variable: 'libraries_to_link.type' + value: 'static_library' + } + flag_group { + expand_if_false: 'libraries_to_link.is_whole_archive' + flag: '%{libraries_to_link.name}' + } + flag_group { + expand_if_true: 'libraries_to_link.is_whole_archive' + flag: '/WHOLEARCHIVE:%{libraries_to_link.name}' + } + } + } + } + } + + # Since this feature is declared earlier in the CROSSTOOL than + # "user_link_flags", this feature will be applied prior to it anwyhere they + # are both implied. And since "user_link_flags" contains the linkopts from + # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD + # file. + feature { + name: 'linker_subsystem_flag' + flag_set { + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: '/SUBSYSTEM:CONSOLE' + } + } + } + + # The "user_link_flags" contains user-defined linkopts (from build rules) + # so it should be defined after features that declare user-overridable flags. + # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag + # but we want to let the user override it, therefore "link_flag_subsystem" is + # defined earlier in the CROSSTOOL file than "user_link_flags". 
+ feature { + name: 'user_link_flags' + flag_set { + expand_if_all_available: 'user_link_flags' + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + iterate_over: 'user_link_flags' + flag: '%{user_link_flags}' + } + } + } + feature { + name: 'legacy_link_flags' + flag_set { + expand_if_all_available: 'legacy_link_flags' + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + iterate_over: 'legacy_link_flags' + flag: '%{legacy_link_flags}' + } + } + } + + feature { + name: 'linker_param_file' + flag_set { + expand_if_all_available: 'linker_param_file' + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + action: 'c++-link-static-library' + flag_group { + flag: '@%{linker_param_file}' + } + } + } + + feature { + name: 'static_link_msvcrt' + } + + feature { + name: 'static_link_msvcrt_no_debug' + flag_set { + action: 'c-compile' + action: 'c++-compile' + flag_group { + flag: "/MT" + } + } + flag_set { + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "/DEFAULTLIB:libcmt.lib" + } + } + requires: { feature: 'fastbuild'} + requires: { feature: 'opt'} + } + + feature { + name: 'dynamic_link_msvcrt_no_debug' + flag_set { + action: 'c-compile' + action: 'c++-compile' + flag_group { + flag: "/MD" + } + } + flag_set { + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "/DEFAULTLIB:msvcrt.lib" + } + } + requires: { feature: 'fastbuild'} + requires: { feature: 'opt'} + } + + feature { + name: 'static_link_msvcrt_debug' + flag_set { + action: 'c-compile' + action: 'c++-compile' + flag_group { + flag: "/MTd" + } + } + flag_set { + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + 
action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "/DEFAULTLIB:libcmtd.lib" + } + } + requires: { feature: 'dbg'} + } + + feature { + name: 'dynamic_link_msvcrt_debug' + flag_set { + action: 'c-compile' + action: 'c++-compile' + flag_group { + flag: "/MDd" + } + } + flag_set { + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "/DEFAULTLIB:msvcrtd.lib" + } + } + requires: { feature: 'dbg'} + } + + feature { + name: 'dbg' + flag_set { + action: 'c-compile' + action: 'c++-compile' + flag_group { + flag: "/Od" + flag: "/Z7" + flag: "/DDEBUG" + } + } + flag_set { + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "/DEBUG:FULL" + flag: "/INCREMENTAL:NO" + } + } + implies: 'generate_pdb_file' + } + + feature { + name: 'fastbuild' + flag_set { + action: 'c-compile' + action: 'c++-compile' + flag_group { + flag: "/Od" + flag: "/Z7" + flag: "/DDEBUG" + } + } + flag_set { + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "/DEBUG:FASTLINK" + flag: "/INCREMENTAL:NO" + } + } + implies: 'generate_pdb_file' + } + + feature { + name: 'opt' + flag_set { + action: 'c-compile' + action: 'c++-compile' + flag_group { + flag: "/O2" + flag: "/DNDEBUG" + } + } + } + + feature { + name: 'user_compile_flags' + flag_set { + expand_if_all_available: 'user_compile_flags' + action: 'preprocess-assemble' + action: 'c-compile' + action: 'c++-compile' + action: 'c++-header-parsing' + action: 'c++-module-compile' + action: 'c++-module-codegen' + flag_group { + iterate_over: 'user_compile_flags' + flag: '%{user_compile_flags}' + } + } + } + + feature { + name: 'sysroot' + flag_set { + expand_if_all_available: 'sysroot' + action: 'assemble' + action: 'preprocess-assemble' + action: 'c-compile' + action: 'c++-compile' + action: 
'c++-header-parsing' + action: 'c++-module-compile' + action: 'c++-module-codegen' + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + iterate_over: 'sysroot' + flag: '--sysroot=%{sysroot}' + } + } + } + + feature { + name: 'unfiltered_compile_flags' + flag_set { + expand_if_all_available: 'unfiltered_compile_flags' + action: 'preprocess-assemble' + action: 'c-compile' + action: 'c++-compile' + action: 'c++-header-parsing' + action: 'c++-module-compile' + action: 'c++-module-codegen' + flag_group { + iterate_over: 'unfiltered_compile_flags' + flag: '%{unfiltered_compile_flags}' + } + } + } + + feature { + name: 'compiler_output_flags' + flag_set { + action: 'assemble' + flag_group { + expand_if_all_available: 'output_file' + expand_if_none_available: 'output_assembly_file' + expand_if_none_available: 'output_preprocess_file' + flag: '/Fo%{output_file}' + flag: '/Zi' + } + } + flag_set { + action: 'preprocess-assemble' + action: 'c-compile' + action: 'c++-compile' + action: 'c++-header-parsing' + action: 'c++-module-compile' + action: 'c++-module-codegen' + flag_group { + expand_if_all_available: 'output_file' + expand_if_none_available: 'output_assembly_file' + expand_if_none_available: 'output_preprocess_file' + flag: '/Fo%{output_file}' + } + flag_group { + expand_if_all_available: 'output_file' + expand_if_all_available: 'output_assembly_file' + flag: '/Fa%{output_file}' + } + flag_group { + expand_if_all_available: 'output_file' + expand_if_all_available: 'output_preprocess_file' + flag: '/P' + flag: '/Fi%{output_file}' + } + } + } + + feature { + name: 'compiler_input_flags' + flag_set { + action: 'assemble' + action: 'preprocess-assemble' + action: 'c-compile' + action: 'c++-compile' + action: 'c++-header-parsing' + action: 'c++-module-compile' + action: 'c++-module-codegen' + flag_group { + expand_if_all_available: 'source_file' + flag: '/c' + flag: '%{source_file}' + } + } + } + + 
feature { + name : 'def_file', + flag_set { + expand_if_all_available: 'def_file_path' + action: 'c++-link-executable' + action: 'c++-link-dynamic-library' + action: "c++-link-nodeps-dynamic-library" + flag_group { + flag: "/DEF:%{def_file_path}" + # We can specify a different DLL name in DEF file, /ignore:4070 suppresses + # the warning message about DLL name doesn't match the default one. + # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx + flag: "/ignore:4070" + } + } + } + + feature { + name: 'windows_export_all_symbols' + } + + feature { + name: 'no_windows_export_all_symbols' + } + + linking_mode_flags { mode: DYNAMIC } +} diff --git a/build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl b/build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl new file mode 100755 index 0000000..ba002b4 --- /dev/null +++ b/build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl @@ -0,0 +1,1493 @@ +"""cc_toolchain_config rule for configuring CUDA toolchains on Linux, Mac, and Windows.""" + +load( + "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl", + "action_config", + "env_entry", + "env_set", + "feature", + "feature_set", + "flag_group", + "flag_set", + "tool", + "tool_path", + "variable_with_value", +) +load( + "@bazel_tools//tools/build_defs/cc:action_names.bzl", + "ASSEMBLE_ACTION_NAME", + "CC_FLAGS_MAKE_VARIABLE_ACTION_NAME", + "CLIF_MATCH_ACTION_NAME", + "CPP_COMPILE_ACTION_NAME", + "CPP_HEADER_PARSING_ACTION_NAME", + "CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME", + "CPP_LINK_EXECUTABLE_ACTION_NAME", + "CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME", + "CPP_LINK_STATIC_LIBRARY_ACTION_NAME", + "CPP_MODULE_CODEGEN_ACTION_NAME", + "CPP_MODULE_COMPILE_ACTION_NAME", + "C_COMPILE_ACTION_NAME", + "LINKSTAMP_COMPILE_ACTION_NAME", + "LTO_BACKEND_ACTION_NAME", + "LTO_INDEXING_ACTION_NAME", + "OBJCPP_COMPILE_ACTION_NAME", + "OBJCPP_EXECUTABLE_ACTION_NAME", + "OBJC_ARCHIVE_ACTION_NAME", + "OBJC_COMPILE_ACTION_NAME", + 
"OBJC_EXECUTABLE_ACTION_NAME", + "OBJC_FULLY_LINK_ACTION_NAME", + "PREPROCESS_ASSEMBLE_ACTION_NAME", + "STRIP_ACTION_NAME", +) + +ACTION_NAMES = struct( + assemble = ASSEMBLE_ACTION_NAME, + c_compile = C_COMPILE_ACTION_NAME, + cc_flags_make_variable = CC_FLAGS_MAKE_VARIABLE_ACTION_NAME, + clif_match = CLIF_MATCH_ACTION_NAME, + cpp_compile = CPP_COMPILE_ACTION_NAME, + cpp_header_parsing = CPP_HEADER_PARSING_ACTION_NAME, + cpp_link_dynamic_library = CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME, + cpp_link_executable = CPP_LINK_EXECUTABLE_ACTION_NAME, + cpp_link_nodeps_dynamic_library = CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME, + cpp_link_static_library = CPP_LINK_STATIC_LIBRARY_ACTION_NAME, + cpp_module_codegen = CPP_MODULE_CODEGEN_ACTION_NAME, + cpp_module_compile = CPP_MODULE_COMPILE_ACTION_NAME, + ld_embed_data = "ld_embed_data", + linkstamp_compile = LINKSTAMP_COMPILE_ACTION_NAME, + lto_backend = LTO_BACKEND_ACTION_NAME, + lto_indexing = LTO_INDEXING_ACTION_NAME, + objc_archive = OBJC_ARCHIVE_ACTION_NAME, + objc_compile = OBJC_COMPILE_ACTION_NAME, + objc_executable = OBJC_EXECUTABLE_ACTION_NAME, + objc_fully_link = OBJC_FULLY_LINK_ACTION_NAME, + objcopy_embed_data = "objcopy_embed_data", + objcpp_compile = OBJCPP_COMPILE_ACTION_NAME, + objcpp_executable = OBJCPP_EXECUTABLE_ACTION_NAME, + preprocess_assemble = PREPROCESS_ASSEMBLE_ACTION_NAME, + strip = STRIP_ACTION_NAME, +) + +def _impl(ctx): + if (ctx.attr.cpu == "darwin"): + toolchain_identifier = "local_darwin" + elif (ctx.attr.cpu == "local"): + toolchain_identifier = "local_linux" + elif (ctx.attr.cpu == "x64_windows"): + toolchain_identifier = "local_windows" + else: + fail("Unreachable") + + host_system_name = "local" + + target_system_name = "local" + + if (ctx.attr.cpu == "darwin"): + target_cpu = "darwin" + elif (ctx.attr.cpu == "local"): + target_cpu = "local" + elif (ctx.attr.cpu == "x64_windows"): + target_cpu = "x64_windows" + else: + fail("Unreachable") + + if (ctx.attr.cpu == "local"): + target_libc = 
"local" + elif (ctx.attr.cpu == "darwin"): + target_libc = "macosx" + elif (ctx.attr.cpu == "x64_windows"): + target_libc = "msvcrt" + else: + fail("Unreachable") + + if (ctx.attr.cpu == "darwin" or + ctx.attr.cpu == "local"): + compiler = "compiler" + elif (ctx.attr.cpu == "x64_windows"): + compiler = "msvc-cl" + else: + fail("Unreachable") + + abi_version = "local" + + abi_libc_version = "local" + + cc_target_os = None + + builtin_sysroot = None + + all_link_actions = [ + ACTION_NAMES.cpp_link_executable, + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ] + + cpp_link_dynamic_library_action = action_config( + action_name = ACTION_NAMES.cpp_link_dynamic_library, + implies = [ + "nologo", + "shared_flag", + "linkstamps", + "output_execpath_flags", + "input_param_flags", + "user_link_flags", + "linker_subsystem_flag", + "linker_param_file", + "msvc_env", + "no_stripping", + "has_configured_linker_path", + "def_file", + ], + tools = [tool(path = ctx.attr.msvc_link_path)], + ) + + cpp_link_nodeps_dynamic_library_action = action_config( + action_name = ACTION_NAMES.cpp_link_nodeps_dynamic_library, + implies = [ + "nologo", + "shared_flag", + "linkstamps", + "output_execpath_flags", + "input_param_flags", + "user_link_flags", + "linker_subsystem_flag", + "linker_param_file", + "msvc_env", + "no_stripping", + "has_configured_linker_path", + "def_file", + ], + tools = [tool(path = ctx.attr.msvc_link_path)], + ) + + cpp_link_static_library_action = action_config( + action_name = ACTION_NAMES.cpp_link_static_library, + implies = [ + "nologo", + "archiver_flags", + "input_param_flags", + "linker_param_file", + "msvc_env", + ], + tools = [tool(path = ctx.attr.msvc_lib_path)], + ) + + assemble_action = action_config( + action_name = ACTION_NAMES.assemble, + implies = [ + "compiler_input_flags", + "compiler_output_flags", + "nologo", + "msvc_env", + "sysroot", + ], + tools = [tool(path = ctx.attr.msvc_ml_path)], + ) + + 
preprocess_assemble_action = action_config( + action_name = ACTION_NAMES.preprocess_assemble, + implies = [ + "compiler_input_flags", + "compiler_output_flags", + "nologo", + "msvc_env", + "sysroot", + ], + tools = [tool(path = ctx.attr.msvc_ml_path)], + ) + + c_compile_action = action_config( + action_name = ACTION_NAMES.c_compile, + implies = [ + "compiler_input_flags", + "compiler_output_flags", + "nologo", + "msvc_env", + "parse_showincludes", + "user_compile_flags", + "sysroot", + "unfiltered_compile_flags", + ], + tools = [tool(path = ctx.attr.msvc_cl_path)], + ) + + cpp_compile_action = action_config( + action_name = ACTION_NAMES.cpp_compile, + implies = [ + "compiler_input_flags", + "compiler_output_flags", + "nologo", + "msvc_env", + "parse_showincludes", + "user_compile_flags", + "sysroot", + "unfiltered_compile_flags", + ], + tools = [tool(path = ctx.attr.msvc_cl_path)], + ) + + cpp_link_executable_action = action_config( + action_name = ACTION_NAMES.cpp_link_executable, + implies = [ + "nologo", + "linkstamps", + "output_execpath_flags", + "input_param_flags", + "user_link_flags", + "linker_subsystem_flag", + "linker_param_file", + "msvc_env", + "no_stripping", + ], + tools = [tool(path = ctx.attr.msvc_link_path)], + ) + + if (ctx.attr.cpu == "darwin" or + ctx.attr.cpu == "local"): + action_configs = [] + elif (ctx.attr.cpu == "x64_windows"): + action_configs = [ + assemble_action, + preprocess_assemble_action, + c_compile_action, + cpp_compile_action, + cpp_link_executable_action, + cpp_link_dynamic_library_action, + cpp_link_nodeps_dynamic_library_action, + cpp_link_static_library_action, + ] + else: + fail("Unreachable") + + no_windows_export_all_symbols_feature = feature(name = "no_windows_export_all_symbols") + + pic_feature = feature( + name = "pic", + enabled = True, + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [ + flag_group(flags = ["-fPIC"], expand_if_available = "pic"), + 
flag_group( + flags = ["-fPIE"], + expand_if_not_available = "pic", + ), + ], + ), + ], + ) + + preprocessor_defines_feature = feature( + name = "preprocessor_defines", + enabled = True, + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.assemble, + ACTION_NAMES.preprocess_assemble, + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_compile, + ], + flag_groups = [ + flag_group( + flags = ["/D%{preprocessor_defines}"], + iterate_over = "preprocessor_defines", + ), + ], + ), + ], + ) + + generate_pdb_file_feature = feature( + name = "generate_pdb_file", + requires = [ + feature_set(features = ["dbg"]), + feature_set(features = ["fastbuild"]), + ], + ) + + linkstamps_feature = feature( + name = "linkstamps", + flag_sets = [ + flag_set( + actions = all_link_actions, + flag_groups = [ + flag_group( + flags = ["%{linkstamp_paths}"], + iterate_over = "linkstamp_paths", + expand_if_available = "linkstamp_paths", + ), + ], + ), + ], + ) + + unfiltered_compile_flags_feature = feature( + name = "unfiltered_compile_flags", + flag_sets = ([ + flag_set( + actions = [ + ACTION_NAMES.preprocess_assemble, + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_module_codegen, + ], + flag_groups = [ + flag_group( + flags = ctx.attr.host_unfiltered_compile_flags, + ), + ], + ), + ] if ctx.attr.host_unfiltered_compile_flags else []), + ) + + determinism_feature = feature( + name = "determinism", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [ + flag_group( + flags = [ + "-Wno-builtin-macro-redefined", + "-D__DATE__=\"redacted\"", + "-D__TIMESTAMP__=\"redacted\"", + "-D__TIME__=\"redacted\"", + ], + ), + ], + ), + ], + ) + + nologo_feature = feature( + name = "nologo", + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + 
ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_module_codegen, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.assemble, + ACTION_NAMES.preprocess_assemble, + ACTION_NAMES.cpp_link_executable, + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ACTION_NAMES.cpp_link_static_library, + ], + flag_groups = [flag_group(flags = ["/nologo"])], + ), + ], + ) + + supports_pic_feature = feature(name = "supports_pic", enabled = True) + + output_execpath_flags_feature = feature( + name = "output_execpath_flags", + flag_sets = [ + flag_set( + actions = all_link_actions, + flag_groups = [ + flag_group( + flags = ["/OUT:%{output_execpath}"], + expand_if_available = "output_execpath", + ), + ], + ), + ], + ) + + default_link_flags_feature = feature( + name = "default_link_flags", + enabled = True, + flag_sets = [ + flag_set( + actions = all_link_actions, + flag_groups = [flag_group(flags = ["/MACHINE:X64"])], + ), + ], + ) + + if (ctx.attr.cpu == "local"): + hardening_feature = feature( + name = "hardening", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [ + flag_group( + flags = [ + "-U_FORTIFY_SOURCE", + "-D_FORTIFY_SOURCE=1", + "-fstack-protector", + ], + ), + ], + ), + flag_set( + actions = [ + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ], + flag_groups = [flag_group(flags = ["-Wl,-z,relro,-z,now"])], + ), + flag_set( + actions = [ACTION_NAMES.cpp_link_executable], + flag_groups = [flag_group(flags = ["-pie", "-Wl,-z,relro,-z,now"])], + ), + ], + ) + elif (ctx.attr.cpu == "darwin"): + hardening_feature = feature( + name = "hardening", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [ + flag_group( + flags = [ + "-U_FORTIFY_SOURCE", + "-D_FORTIFY_SOURCE=1", + "-fstack-protector", + ], + ), + ], + ), + flag_set( + actions = [ACTION_NAMES.cpp_link_executable], + 
flag_groups = [flag_group(flags = ["-pie"])], + ), + ], + ) + else: + hardening_feature = None + + supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True) + + targets_windows_feature = feature( + name = "targets_windows", + enabled = True, + implies = ["copy_dynamic_libraries_to_binary"], + ) + + msvc_env_feature = feature( + name = "msvc_env", + env_sets = [ + env_set( + actions = [ + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_module_codegen, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.assemble, + ACTION_NAMES.preprocess_assemble, + ACTION_NAMES.cpp_link_executable, + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ACTION_NAMES.cpp_link_static_library, + ], + env_entries = [ + env_entry(key = "PATH", value = ctx.attr.msvc_env_path), + env_entry( + key = "INCLUDE", + value = ctx.attr.msvc_env_include, + ), + env_entry(key = "LIB", value = ctx.attr.msvc_env_lib), + env_entry(key = "TMP", value = ctx.attr.msvc_env_tmp), + env_entry(key = "TEMP", value = ctx.attr.msvc_env_tmp), + ], + ), + ], + ) + + linker_subsystem_flag_feature = feature( + name = "linker_subsystem_flag", + flag_sets = [ + flag_set( + actions = all_link_actions, + flag_groups = [flag_group(flags = ["/SUBSYSTEM:CONSOLE"])], + ), + ], + ) + + dynamic_link_msvcrt_no_debug_feature = feature( + name = "dynamic_link_msvcrt_no_debug", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [flag_group(flags = ["/MD"])], + ), + flag_set( + actions = all_link_actions, + flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrt.lib"])], + ), + ], + requires = [ + feature_set(features = ["fastbuild"]), + feature_set(features = ["opt"]), + ], + ) + + warnings_feature = feature( + name = "warnings", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [ + flag_group( 
+ flags = ["-Wall"] + ctx.attr.host_compiler_warnings, + ), + ], + ), + ], + ) + + dynamic_link_msvcrt_debug_feature = feature( + name = "dynamic_link_msvcrt_debug", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [flag_group(flags = ["/MDd"])], + ), + flag_set( + actions = all_link_actions, + flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrtd.lib"])], + ), + ], + requires = [feature_set(features = ["dbg"])], + ) + + compiler_output_flags_feature = feature( + name = "compiler_output_flags", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.assemble], + flag_groups = [ + flag_group( + flag_groups = [ + flag_group( + flags = ["/Fo%{output_file}", "/Zi"], + expand_if_not_available = "output_preprocess_file", + ), + ], + expand_if_available = "output_file", + expand_if_not_available = "output_assembly_file", + ), + ], + ), + flag_set( + actions = [ + ACTION_NAMES.preprocess_assemble, + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_module_codegen, + ], + flag_groups = [ + flag_group( + flag_groups = [ + flag_group( + flags = ["/Fo%{output_file}"], + expand_if_not_available = "output_preprocess_file", + ), + ], + expand_if_available = "output_file", + expand_if_not_available = "output_assembly_file", + ), + flag_group( + flag_groups = [ + flag_group( + flags = ["/Fa%{output_file}"], + expand_if_available = "output_assembly_file", + ), + ], + expand_if_available = "output_file", + ), + flag_group( + flag_groups = [ + flag_group( + flags = ["/P", "/Fi%{output_file}"], + expand_if_available = "output_preprocess_file", + ), + ], + expand_if_available = "output_file", + ), + ], + ), + ], + ) + + default_compile_flags_feature = feature( + name = "default_compile_flags", + enabled = True, + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.assemble, + ACTION_NAMES.preprocess_assemble, + 
ACTION_NAMES.linkstamp_compile, + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_module_codegen, + ACTION_NAMES.lto_backend, + ACTION_NAMES.clif_match, + ], + flag_groups = [ + flag_group( + flags = [ + "/DCOMPILER_MSVC", + "/DNOMINMAX", + "/D_WIN32_WINNT=0x0600", + "/D_CRT_SECURE_NO_DEPRECATE", + "/D_CRT_SECURE_NO_WARNINGS", + "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS", + "/bigobj", + "/Zm500", + "/J", + "/Gy", + "/GF", + "/EHsc", + "/wd4351", + "/wd4291", + "/wd4250", + "/wd4996", + ], + ), + ], + ), + ], + ) + + static_link_msvcrt_debug_feature = feature( + name = "static_link_msvcrt_debug", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [flag_group(flags = ["/MTd"])], + ), + flag_set( + actions = all_link_actions, + flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmtd.lib"])], + ), + ], + requires = [feature_set(features = ["dbg"])], + ) + + static_link_msvcrt_feature = feature(name = "static_link_msvcrt") + + if (ctx.attr.cpu == "darwin" or + ctx.attr.cpu == "local"): + dbg_feature = feature( + name = "dbg", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [flag_group(flags = ["-g"])], + ), + ], + implies = ["common"], + ) + elif (ctx.attr.cpu == "x64_windows"): + dbg_feature = feature( + name = "dbg", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], + ), + flag_set( + actions = all_link_actions, + flag_groups = [flag_group(flags = ["/DEBUG:FULL", "/INCREMENTAL:NO"])], + ), + ], + implies = ["generate_pdb_file"], + ) + else: + dbg_feature = None + + undefined_dynamic_feature = feature( + name = "undefined-dynamic", + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.cpp_link_dynamic_library, + 
ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ACTION_NAMES.cpp_link_executable, + ], + flag_groups = [flag_group(flags = ["-undefined", "dynamic_lookup"])], + ), + ], + ) + + parse_showincludes_feature = feature( + name = "parse_showincludes", + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.preprocess_assemble, + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_header_parsing, + ], + flag_groups = [flag_group(flags = ["/showIncludes"])], + ), + ], + ) + + linker_param_file_feature = feature( + name = "linker_param_file", + flag_sets = [ + flag_set( + actions = all_link_actions + + [ACTION_NAMES.cpp_link_static_library], + flag_groups = [ + flag_group( + flags = ["@%{linker_param_file}"], + expand_if_available = "linker_param_file", + ), + ], + ), + ], + ) + + static_link_msvcrt_no_debug_feature = feature( + name = "static_link_msvcrt_no_debug", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [flag_group(flags = ["/MT"])], + ), + flag_set( + actions = all_link_actions, + flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmt.lib"])], + ), + ], + requires = [ + feature_set(features = ["fastbuild"]), + feature_set(features = ["opt"]), + ], + ) + + supports_interface_shared_libraries_feature = feature( + name = "supports_interface_shared_libraries", + enabled = True, + ) + + disable_assertions_feature = feature( + name = "disable-assertions", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [flag_group(flags = ["-DNDEBUG"])], + ), + ], + ) + + if (ctx.attr.cpu == "x64_windows"): + fastbuild_feature = feature( + name = "fastbuild", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], + ), + flag_set( + actions = all_link_actions, + flag_groups = [ + flag_group(flags = 
["/DEBUG:FASTLINK", "/INCREMENTAL:NO"]), + ], + ), + ], + implies = ["generate_pdb_file"], + ) + elif (ctx.attr.cpu == "darwin" or + ctx.attr.cpu == "local"): + fastbuild_feature = feature(name = "fastbuild", implies = ["common"]) + else: + fastbuild_feature = None + + user_compile_flags_feature = feature( + name = "user_compile_flags", + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.preprocess_assemble, + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_module_codegen, + ], + flag_groups = [ + flag_group( + flags = ["%{user_compile_flags}"], + iterate_over = "user_compile_flags", + expand_if_available = "user_compile_flags", + ), + ], + ), + ], + ) + + compiler_input_flags_feature = feature( + name = "compiler_input_flags", + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.assemble, + ACTION_NAMES.preprocess_assemble, + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_module_codegen, + ], + flag_groups = [ + flag_group( + flags = ["/c", "%{source_file}"], + expand_if_available = "source_file", + ), + ], + ), + ], + ) + + no_legacy_features_feature = feature(name = "no_legacy_features") + + archiver_flags_feature = feature( + name = "archiver_flags", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.cpp_link_static_library], + flag_groups = [ + flag_group( + flags = ["/OUT:%{output_execpath}"], + expand_if_available = "output_execpath", + ), + ], + ), + ], + ) + + redirector_feature = feature( + name = "redirector", + enabled = True, + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_module_codegen, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.assemble, + ACTION_NAMES.preprocess_assemble, + ], + flag_groups = [ + flag_group( + flags = [ + "-B", + 
"external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py", + ], + ), + ], + ), + ], + ) + + linker_bin_path_feature = feature( + name = "linker-bin-path", + flag_sets = [ + flag_set( + actions = all_link_actions, + flag_groups = [flag_group(flags = ["-B" + ctx.attr.linker_bin_path])], + ), + ], + ) + + if (ctx.attr.cpu == "local"): + opt_feature = feature( + name = "opt", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [ + flag_group( + flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], + ), + ], + ), + flag_set( + actions = [ + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ACTION_NAMES.cpp_link_executable, + ], + flag_groups = [flag_group(flags = ["-Wl,--gc-sections"])], + ), + ], + implies = ["common", "disable-assertions"], + ) + elif (ctx.attr.cpu == "darwin"): + opt_feature = feature( + name = "opt", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [ + flag_group( + flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], + ), + ], + ), + ], + implies = ["common", "disable-assertions"], + ) + elif (ctx.attr.cpu == "x64_windows"): + opt_feature = feature( + name = "opt", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [flag_group(flags = ["/O2", "/DNDEBUG"])], + ), + ], + ) + else: + opt_feature = None + + include_paths_feature = feature( + name = "include_paths", + enabled = True, + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.assemble, + ACTION_NAMES.preprocess_assemble, + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_compile, + ], + flag_groups = [ + flag_group( + flags = ["/I%{quote_include_paths}"], + iterate_over = "quote_include_paths", + ), + flag_group( + flags = ["/I%{include_paths}"], + iterate_over = "include_paths", + 
), + flag_group( + flags = ["/I%{system_include_paths}"], + iterate_over = "system_include_paths", + ), + ], + ), + ], + ) + + shared_flag_feature = feature( + name = "shared_flag", + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ], + flag_groups = [flag_group(flags = ["/DLL"])], + ), + ], + ) + + windows_export_all_symbols_feature = feature(name = "windows_export_all_symbols") + + frame_pointer_feature = feature( + name = "frame-pointer", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], + flag_groups = [flag_group(flags = ["-fno-omit-frame-pointer"])], + ), + ], + ) + + build_id_feature = feature( + name = "build-id", + flag_sets = [ + flag_set( + actions = all_link_actions, + flag_groups = [ + flag_group( + flags = ["-Wl,--build-id=md5", "-Wl,--hash-style=gnu"], + ), + ], + ), + ], + ) + + sysroot_feature = feature( + name = "sysroot", + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.assemble, + ACTION_NAMES.preprocess_assemble, + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_header_parsing, + ACTION_NAMES.cpp_module_compile, + ACTION_NAMES.cpp_module_codegen, + ACTION_NAMES.cpp_link_executable, + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ], + flag_groups = [ + flag_group( + flags = ["--sysroot=%{sysroot}"], + iterate_over = "sysroot", + expand_if_available = "sysroot", + ), + ], + ), + ], + ) + + def_file_feature = feature( + name = "def_file", + flag_sets = [ + flag_set( + actions = all_link_actions, + flag_groups = [ + flag_group( + flags = ["/DEF:%{def_file_path}", "/ignore:4070"], + expand_if_available = "def_file_path", + ), + ], + ), + ], + ) + + if (ctx.attr.cpu == "darwin"): + stdlib_feature = feature( + name = "stdlib", + flag_sets = [ + flag_set( + actions = all_link_actions, + flag_groups = [flag_group(flags = ["-lc++"])], + ), + ], + ) + elif 
(ctx.attr.cpu == "local"): + stdlib_feature = feature( + name = "stdlib", + flag_sets = [ + flag_set( + actions = all_link_actions, + flag_groups = [flag_group(flags = ["-lstdc++"])], + ), + ], + ) + else: + stdlib_feature = None + + no_stripping_feature = feature(name = "no_stripping") + + alwayslink_feature = feature( + name = "alwayslink", + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ACTION_NAMES.cpp_link_executable, + ], + flag_groups = [flag_group(flags = ["-Wl,-no-as-needed"])], + ), + ], + ) + + input_param_flags_feature = feature( + name = "input_param_flags", + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ], + flag_groups = [ + flag_group( + flags = ["/IMPLIB:%{interface_library_output_path}"], + expand_if_available = "interface_library_output_path", + ), + ], + ), + flag_set( + actions = all_link_actions + + [ACTION_NAMES.cpp_link_static_library], + flag_groups = [ + flag_group( + iterate_over = "libraries_to_link", + flag_groups = [ + flag_group( + iterate_over = "libraries_to_link.object_files", + flag_groups = [flag_group(flags = ["%{libraries_to_link.object_files}"])], + expand_if_equal = variable_with_value( + name = "libraries_to_link.type", + value = "object_file_group", + ), + ), + flag_group( + flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], + expand_if_equal = variable_with_value( + name = "libraries_to_link.type", + value = "object_file", + ), + ), + flag_group( + flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], + expand_if_equal = variable_with_value( + name = "libraries_to_link.type", + value = "interface_library", + ), + ), + flag_group( + flag_groups = [ + flag_group( + flags = ["%{libraries_to_link.name}"], + expand_if_false = "libraries_to_link.is_whole_archive", + ), + flag_group( + flags = 
["/WHOLEARCHIVE:%{libraries_to_link.name}"], + expand_if_true = "libraries_to_link.is_whole_archive", + ), + ], + expand_if_equal = variable_with_value( + name = "libraries_to_link.type", + value = "static_library", + ), + ), + ], + expand_if_available = "libraries_to_link", + ), + ], + ), + ], + ) + + if (ctx.attr.cpu == "local"): + no_canonical_prefixes_feature = feature( + name = "no-canonical-prefixes", + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_link_executable, + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ], + flag_groups = [ + flag_group( + flags = [ + "-no-canonical-prefixes", + ] + ctx.attr.extra_no_canonical_prefixes_flags, + ), + ], + ), + ], + ) + elif (ctx.attr.cpu == "darwin"): + no_canonical_prefixes_feature = feature( + name = "no-canonical-prefixes", + flag_sets = [ + flag_set( + actions = [ + ACTION_NAMES.c_compile, + ACTION_NAMES.cpp_compile, + ACTION_NAMES.cpp_link_executable, + ACTION_NAMES.cpp_link_dynamic_library, + ACTION_NAMES.cpp_link_nodeps_dynamic_library, + ], + flag_groups = [flag_group(flags = ["-no-canonical-prefixes"])], + ), + ], + ) + else: + no_canonical_prefixes_feature = None + + has_configured_linker_path_feature = feature(name = "has_configured_linker_path") + + copy_dynamic_libraries_to_binary_feature = feature(name = "copy_dynamic_libraries_to_binary") + + user_link_flags_feature = feature( + name = "user_link_flags", + flag_sets = [ + flag_set( + actions = all_link_actions, + flag_groups = [ + flag_group( + flags = ["%{user_link_flags}"], + iterate_over = "user_link_flags", + expand_if_available = "user_link_flags", + ), + ], + ), + ], + ) + + cpp11_feature = feature( + name = "c++11", + flag_sets = [ + flag_set( + actions = [ACTION_NAMES.cpp_compile], + flag_groups = [flag_group(flags = ["-std=c++11"])], + ), + ], + ) + + if (ctx.attr.cpu == "local"): + common_feature = feature( + name = "common", + implies 
= [ + "stdlib", + "c++11", + "determinism", + "alwayslink", + "hardening", + "warnings", + "frame-pointer", + "build-id", + "no-canonical-prefixes", + "linker-bin-path", + ], + ) + elif (ctx.attr.cpu == "darwin"): + common_feature = feature( + name = "common", + implies = [ + "stdlib", + "c++11", + "determinism", + "hardening", + "warnings", + "frame-pointer", + "no-canonical-prefixes", + "linker-bin-path", + "undefined-dynamic", + ], + ) + else: + common_feature = None + + if (ctx.attr.cpu == "local"): + features = [ + cpp11_feature, + stdlib_feature, + determinism_feature, + alwayslink_feature, + pic_feature, + hardening_feature, + warnings_feature, + frame_pointer_feature, + build_id_feature, + no_canonical_prefixes_feature, + disable_assertions_feature, + linker_bin_path_feature, + common_feature, + opt_feature, + fastbuild_feature, + dbg_feature, + supports_dynamic_linker_feature, + supports_pic_feature, + ] + elif (ctx.attr.cpu == "darwin"): + features = [ + cpp11_feature, + stdlib_feature, + determinism_feature, + pic_feature, + hardening_feature, + warnings_feature, + frame_pointer_feature, + no_canonical_prefixes_feature, + disable_assertions_feature, + linker_bin_path_feature, + undefined_dynamic_feature, + common_feature, + opt_feature, + fastbuild_feature, + dbg_feature, + supports_dynamic_linker_feature, + supports_pic_feature, + ] + elif (ctx.attr.cpu == "x64_windows"): + features = [ + no_legacy_features_feature, + redirector_feature, + nologo_feature, + has_configured_linker_path_feature, + no_stripping_feature, + targets_windows_feature, + copy_dynamic_libraries_to_binary_feature, + default_compile_flags_feature, + msvc_env_feature, + include_paths_feature, + preprocessor_defines_feature, + parse_showincludes_feature, + generate_pdb_file_feature, + shared_flag_feature, + linkstamps_feature, + output_execpath_flags_feature, + archiver_flags_feature, + input_param_flags_feature, + linker_subsystem_flag_feature, + user_link_flags_feature, + 
default_link_flags_feature, + linker_param_file_feature, + static_link_msvcrt_feature, + static_link_msvcrt_no_debug_feature, + dynamic_link_msvcrt_no_debug_feature, + static_link_msvcrt_debug_feature, + dynamic_link_msvcrt_debug_feature, + dbg_feature, + fastbuild_feature, + opt_feature, + user_compile_flags_feature, + sysroot_feature, + unfiltered_compile_flags_feature, + compiler_output_flags_feature, + compiler_input_flags_feature, + def_file_feature, + windows_export_all_symbols_feature, + no_windows_export_all_symbols_feature, + supports_dynamic_linker_feature, + supports_interface_shared_libraries_feature, + ] + else: + fail("Unreachable") + + cxx_builtin_include_directories = ctx.attr.builtin_include_directories + + if (ctx.attr.cpu == "x64_windows"): + tool_paths = [ + tool_path(name = "ar", path = ctx.attr.msvc_lib_path), + tool_path(name = "ml", path = ctx.attr.msvc_ml_path), + tool_path(name = "cpp", path = ctx.attr.msvc_cl_path), + tool_path(name = "gcc", path = ctx.attr.msvc_cl_path), + tool_path(name = "gcov", path = "wrapper/bin/msvc_nop.bat"), + tool_path(name = "ld", path = ctx.attr.msvc_link_path), + tool_path(name = "nm", path = "wrapper/bin/msvc_nop.bat"), + tool_path( + name = "objcopy", + path = "wrapper/bin/msvc_nop.bat", + ), + tool_path( + name = "objdump", + path = "wrapper/bin/msvc_nop.bat", + ), + tool_path( + name = "strip", + path = "wrapper/bin/msvc_nop.bat", + ), + ] + elif (ctx.attr.cpu == "local"): + tool_paths = [ + tool_path(name = "gcc", path = ctx.attr.host_compiler_path), + tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/ar"), + tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), + tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), + tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), + tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), + tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), + 
tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), + tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), + tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), + tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), + ] + elif (ctx.attr.cpu == "darwin"): + tool_paths = [ + tool_path(name = "gcc", path = ctx.attr.host_compiler_path), + tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/libtool"), + tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), + tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), + tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), + tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), + tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), + tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), + tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), + tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), + tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), + ] + else: + fail("Unreachable") + + out = ctx.actions.declare_file(ctx.label.name) + ctx.actions.write(out, "Fake executable") + return [ + cc_common.create_cc_toolchain_config_info( + ctx = ctx, + features = features, + action_configs = action_configs, + artifact_name_patterns = [], + cxx_builtin_include_directories = cxx_builtin_include_directories, + toolchain_identifier = toolchain_identifier, + host_system_name = host_system_name, + target_system_name = target_system_name, + target_cpu = target_cpu, + target_libc = target_libc, + compiler = compiler, + abi_version = abi_version, + abi_libc_version = abi_libc_version, + tool_paths = tool_paths, + make_variables = [], + builtin_sysroot = builtin_sysroot, + cc_target_os = cc_target_os, + ), + DefaultInfo( + executable = out, + ), + ] + 
+cc_toolchain_config = rule( + attrs = { + "cpu": attr.string( + mandatory = True, + values = [ + "darwin", + "local", + "x64_windows", + ], + ), + "builtin_include_directories": attr.string_list(), + "extra_no_canonical_prefixes_flags": attr.string_list(), + "host_compiler_path": attr.string(), + "host_compiler_prefix": attr.string(), + "host_compiler_warnings": attr.string_list(), + "host_unfiltered_compile_flags": attr.string_list(), + "linker_bin_path": attr.string(), + "msvc_cl_path": attr.string(default = "msvc_not_used"), + "msvc_env_include": attr.string(default = "msvc_not_used"), + "msvc_env_lib": attr.string(default = "msvc_not_used"), + "msvc_env_path": attr.string(default = "msvc_not_used"), + "msvc_env_tmp": attr.string(default = "msvc_not_used"), + "msvc_lib_path": attr.string(default = "msvc_not_used"), + "msvc_link_path": attr.string(default = "msvc_not_used"), + "msvc_ml_path": attr.string(default = "msvc_not_used"), + }, + executable = True, + provides = [CcToolchainConfigInfo], + implementation = _impl, +) diff --git a/build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl b/build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl new file mode 100644 index 0000000..81c16c6 --- /dev/null +++ b/build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl @@ -0,0 +1,269 @@ +#!/usr/bin/env python +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Crosstool wrapper for compiling CUDA programs. + +SYNOPSIS: + crosstool_wrapper_is_not_gcc [options passed in by cc_library() + or cc_binary() rule] + +DESCRIPTION: + This script is expected to be called by the cc_library() or cc_binary() bazel + rules. When the option "-x cuda" is present in the list of arguments passed + to this script, it invokes the nvcc CUDA compiler. Most arguments are passed + as is as a string to --compiler-options of nvcc. When "-x cuda" is not + present, this wrapper invokes hybrid_driver_is_not_gcc with the input + arguments as is. + +NOTES: + Changes to the contents of this file must be propagated from + //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to + //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc +""" + +from __future__ import print_function + +__author__ = 'keveman@google.com (Manjunath Kudlur)' + +from argparse import ArgumentParser +import os +import subprocess +import re +import sys +import pipes + +# Template values set by cuda_autoconf. +CPU_COMPILER = ('%{cpu_compiler}') +GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') + +NVCC_PATH = '%{nvcc_path}' +PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH) +NVCC_VERSION = '%{cuda_version}' + +def Log(s): + print('gpus/crosstool: {0}'.format(s)) + + +def GetOptionValue(argv, option): + """Extract the list of values for option from the argv list. + + Args: + argv: A list of strings, possibly the argv passed to main(). + option: The option whose value to extract, without the leading '-'. + + Returns: + A list of values, either directly following the option, + (eg., -opt val1 val2) or values collected from multiple occurrences of + the option (eg., -opt val1 -opt val2). 
+ """ + + parser = ArgumentParser() + parser.add_argument('-' + option, nargs='*', action='append') + args, _ = parser.parse_known_args(argv) + if not args or not vars(args)[option]: + return [] + else: + return sum(vars(args)[option], []) + + +def GetHostCompilerOptions(argv): + """Collect the -isystem, -iquote, and --sysroot option values from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + + Returns: + The string that can be used as the --compiler-options to nvcc. + """ + + parser = ArgumentParser() + parser.add_argument('-isystem', nargs='*', action='append') + parser.add_argument('-iquote', nargs='*', action='append') + parser.add_argument('--sysroot', nargs=1) + parser.add_argument('-g', nargs='*', action='append') + parser.add_argument('-fno-canonical-system-headers', action='store_true') + + args, _ = parser.parse_known_args(argv) + + opts = '' + + if args.isystem: + opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, [])) + if args.iquote: + opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, [])) + if args.g: + opts += ' -g' + ' -g'.join(sum(args.g, [])) + if args.fno_canonical_system_headers: + opts += ' -fno-canonical-system-headers' + if args.sysroot: + opts += ' --sysroot ' + args.sysroot[0] + + return opts + +def _update_options(nvcc_options): + if NVCC_VERSION in ("7.0",): + return nvcc_options + + update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" } + return [ update_options[opt] if opt in update_options else opt + for opt in nvcc_options ] + +def GetNvccOptions(argv): + """Collect the -nvcc_options values from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + + Returns: + The string that can be passed directly to nvcc. 
+ """ + + parser = ArgumentParser() + parser.add_argument('-nvcc_options', nargs='*', action='append') + + args, _ = parser.parse_known_args(argv) + + if args.nvcc_options: + options = _update_options(sum(args.nvcc_options, [])) + return ' '.join(['--'+a for a in options]) + return '' + + +def InvokeNvcc(argv, log=False): + """Call nvcc with arguments assembled from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + log: True if logging is requested. + + Returns: + The return value of calling os.system('nvcc ' + args) + """ + + host_compiler_options = GetHostCompilerOptions(argv) + nvcc_compiler_options = GetNvccOptions(argv) + opt_option = GetOptionValue(argv, 'O') + m_options = GetOptionValue(argv, 'm') + m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']]) + include_options = GetOptionValue(argv, 'I') + out_file = GetOptionValue(argv, 'o') + depfiles = GetOptionValue(argv, 'MF') + defines = GetOptionValue(argv, 'D') + defines = ''.join([' -D' + define for define in defines]) + undefines = GetOptionValue(argv, 'U') + undefines = ''.join([' -U' + define for define in undefines]) + std_options = GetOptionValue(argv, 'std') + # Supported -std flags as of CUDA 9.0. Only keep last to mimic gcc/clang. + nvcc_allowed_std_options = ["c++03", "c++11", "c++14", "c++17"] + std_options = ''.join([' -std=' + define + for define in std_options if define in nvcc_allowed_std_options]) + + # The list of source files get passed after the -c option. I don't know of + # any other reliable way to just get the list of source files to be compiled. + src_files = GetOptionValue(argv, 'c') + + # Pass -w through from host to nvcc, but don't do anything fancier with + # warnings-related flags, since they're not necessarily the same across + # compilers. 
+ warning_options = ' -w' if '-w' in argv else '' + + if len(src_files) == 0: + return 1 + if len(out_file) != 1: + return 1 + + opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0) + else ' -g -G') + + includes = (' -I ' + ' -I '.join(include_options) + if len(include_options) > 0 + else '') + + # Unfortunately, there are other options that have -c prefix too. + # So allowing only those look like C/C++ files. + src_files = [f for f in src_files if + re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)] + srcs = ' '.join(src_files) + out = ' -o ' + out_file[0] + + supported_cuda_compute_capabilities = [ %{cuda_compute_capabilities} ] + nvccopts = '-D_FORCE_INLINES ' + supported_cuda_compute_capabilities = sorted([ + x.replace(".", "") for x in supported_cuda_compute_capabilities]) + for capability in supported_cuda_compute_capabilities[:-1]: + nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s\" ' % ( + capability, capability) + if supported_cuda_compute_capabilities: + capability = supported_cuda_compute_capabilities[-1] + nvccopts += r'-gencode=arch=compute_%s,code=\"sm_%s,compute_%s\" ' % ( + capability, capability, capability) + nvccopts += ' ' + nvcc_compiler_options + nvccopts += undefines + nvccopts += defines + nvccopts += std_options + nvccopts += m_options + nvccopts += warning_options + + if depfiles: + # Generate the dependency file + depfile = depfiles[0] + cmd = (NVCC_PATH + ' ' + nvccopts + + ' --compiler-options "' + host_compiler_options + '"' + + ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + + ' -I .' + + ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile) + if log: Log(cmd) + exit_status = os.system(cmd) + if exit_status != 0: + return exit_status + + cmd = (NVCC_PATH + ' ' + nvccopts + + ' --compiler-options "' + host_compiler_options + ' -fPIC"' + + ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + + ' -I .' 
+ + ' -x cu ' + opt + includes + ' -c ' + srcs + out) + + # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'. + # Need to investigate and fix. + cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd + if log: Log(cmd) + return os.system(cmd) + + +def main(): + parser = ArgumentParser() + parser.add_argument('-x', nargs=1) + parser.add_argument('--cuda_log', action='store_true') + args, leftover = parser.parse_known_args(sys.argv[1:]) + + if args.x and args.x[0] == 'cuda': + if args.cuda_log: Log('-x cuda') + leftover = [pipes.quote(s) for s in leftover] + if args.cuda_log: Log('using nvcc') + return InvokeNvcc(leftover, log=args.cuda_log) + + # Strip our flags before passing through to the CPU compiler for files which + # are not -x cuda. We can't just pass 'leftover' because it also strips -x. + # We not only want to pass -x to the CPU compiler, but also keep it in its + # relative location in the argv list (the compiler is actually sensitive to + # this). + cpu_compiler_flags = [flag for flag in sys.argv[1:] + if not flag.startswith(('--cuda_log'))] + + return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) + +if __name__ == '__main__': + sys.exit(main()) diff --git a/build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl new file mode 100644 index 0000000..1a09756 --- /dev/null +++ b/build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl @@ -0,0 +1,192 @@ +#!/usr/bin/env python +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows. + +DESCRIPTION: + This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc +""" + +from __future__ import print_function + +from argparse import ArgumentParser +import os +import subprocess +import re +import sys +import pipes + +# Template values set by cuda_autoconf. +CPU_COMPILER = ('%{cpu_compiler}') +GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') + +NVCC_PATH = '%{nvcc_path}' +NVCC_VERSION = '%{cuda_version}' +NVCC_TEMP_DIR = "%{nvcc_tmp_dir}" +supported_cuda_compute_capabilities = [ %{cuda_compute_capabilities} ] + +def Log(s): + print('gpus/crosstool: {0}'.format(s)) + + +def GetOptionValue(argv, option): + """Extract the list of values for option from options. + + Args: + option: The option whose value to extract, without the leading '/'. + + Returns: + 1. A list of values, either directly following the option, + (eg., /opt val1 val2) or values collected from multiple occurrences of + the option (eg., /opt val1 /opt val2). + 2. The leftover options. 
+ """ + + parser = ArgumentParser(prefix_chars='/') + parser.add_argument('/' + option, nargs='*', action='append') + args, leftover = parser.parse_known_args(argv) + if args and vars(args)[option]: + return (sum(vars(args)[option], []), leftover) + return ([], leftover) + +def _update_options(nvcc_options): + if NVCC_VERSION in ("7.0",): + return nvcc_options + + update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" } + return [ update_options[opt] if opt in update_options else opt + for opt in nvcc_options ] + +def GetNvccOptions(argv): + """Collect the -nvcc_options values from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + + Returns: + 1. The string that can be passed directly to nvcc. + 2. The leftover options. + """ + + parser = ArgumentParser() + parser.add_argument('-nvcc_options', nargs='*', action='append') + + args, leftover = parser.parse_known_args(argv) + + if args.nvcc_options: + options = _update_options(sum(args.nvcc_options, [])) + return (['--' + a for a in options], leftover) + return ([], leftover) + + +def InvokeNvcc(argv, log=False): + """Call nvcc with arguments assembled from argv. + + Args: + argv: A list of strings, possibly the argv passed to main(). + log: True if logging is requested. 
+ + Returns: + The return value of calling os.system('nvcc ' + args) + """ + + src_files = [f for f in argv if + re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)] + if len(src_files) == 0: + raise Error('No source files found for cuda compilation.') + + out_file = [ f for f in argv if f.startswith('/Fo') ] + if len(out_file) != 1: + raise Error('Please sepecify exactly one output file for cuda compilation.') + out = ['-o', out_file[0][len('/Fo'):]] + + nvcc_compiler_options, argv = GetNvccOptions(argv) + + opt_option, argv = GetOptionValue(argv, 'O') + opt = ['-g', '-G'] + if (len(opt_option) > 0 and opt_option[0] != 'd'): + opt = ['-O2'] + + include_options, argv = GetOptionValue(argv, 'I') + includes = ["-I " + include for include in include_options] + + defines, argv = GetOptionValue(argv, 'D') + defines = ['-D' + define for define in defines] + + undefines, argv = GetOptionValue(argv, 'U') + undefines = ['-U' + define for define in undefines] + + # The rest of the unrecongized options should be passed to host compiler + host_compiler_options = [option for option in argv if option not in (src_files + out_file)] + + m_options = ["-m64"] + + nvccopts = ['-D_FORCE_INLINES'] + for capability in supported_cuda_compute_capabilities: + capability = capability.replace('.', '') + nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % ( + capability, capability, capability)] + nvccopts += nvcc_compiler_options + nvccopts += undefines + nvccopts += defines + nvccopts += m_options + nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"'] + nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files + # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP + # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check + # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver + # Different actions are sharing 
NVCC_TEMP_DIR, so we cannot remove it if the directory already exists. + if os.path.isfile(NVCC_TEMP_DIR): + os.remove(NVCC_TEMP_DIR) + if not os.path.exists(NVCC_TEMP_DIR): + os.makedirs(NVCC_TEMP_DIR) + nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR] + cmd = [NVCC_PATH] + nvccopts + if log: + Log(cmd) + proc = subprocess.Popen(cmd, + stdout=sys.stdout, + stderr=sys.stderr, + env=os.environ.copy(), + shell=True) + proc.wait() + return proc.returncode + +def main(): + parser = ArgumentParser() + parser.add_argument('-x', nargs=1) + parser.add_argument('--cuda_log', action='store_true') + args, leftover = parser.parse_known_args(sys.argv[1:]) + + if args.x and args.x[0] == 'cuda': + if args.cuda_log: Log('-x cuda') + leftover = [pipes.quote(s) for s in leftover] + if args.cuda_log: Log('using nvcc') + return InvokeNvcc(leftover, log=args.cuda_log) + + # Strip our flags before passing through to the CPU compiler for files which + # are not -x cuda. We can't just pass 'leftover' because it also strips -x. + # We not only want to pass -x to the CPU compiler, but also keep it in its + # relative location in the argv list (the compiler is actually sensitive to + # this). + cpu_compiler_flags = [flag for flag in sys.argv[1:] + if not flag.startswith(('--cuda_log')) + and not flag.startswith(('-nvcc_options'))] + + return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) + +if __name__ == '__main__': + sys.exit(main()) diff --git a/build_deps/toolchains/gpu/cub.BUILD b/build_deps/toolchains/gpu/cub.BUILD new file mode 100644 index 0000000..cdc9e4f --- /dev/null +++ b/build_deps/toolchains/gpu/cub.BUILD @@ -0,0 +1,25 @@ +# Description: CUB library which is a set of primitives for GPU programming. 
+ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda") + +package( + default_visibility = ["//visibility:public"], +) + +licenses(["notice"]) # BSD + +filegroup( + name = "cub_header_files", + srcs = glob([ + "cub/**", + ]), +) + +cc_library( + name = "cub", + hdrs = if_cuda([":cub_header_files"]), + include_prefix = "gpu", + deps = [ + "@local_config_cuda//cuda:cuda_headers", + ], +) diff --git a/build_deps/toolchains/gpu/cuda/BUILD b/build_deps/toolchains/gpu/cuda/BUILD new file mode 100644 index 0000000..e69de29 diff --git a/build_deps/toolchains/gpu/cuda/BUILD.tpl b/build_deps/toolchains/gpu/cuda/BUILD.tpl new file mode 100644 index 0000000..1ac5643 --- /dev/null +++ b/build_deps/toolchains/gpu/cuda/BUILD.tpl @@ -0,0 +1,227 @@ +load(":build_defs.bzl", "cuda_header_library") + +licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like + +package(default_visibility = ["//visibility:public"]) + +config_setting( + name = "using_nvcc", + values = { + "define": "using_cuda_nvcc=true", + }, +) + +config_setting( + name = "using_clang", + values = { + "define": "using_cuda_clang=true", + }, +) + +# Equivalent to using_clang && -c opt. 
+config_setting( + name = "using_clang_opt", + values = { + "define": "using_cuda_clang=true", + "compilation_mode": "opt", + }, +) + +config_setting( + name = "darwin", + values = {"cpu": "darwin"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "freebsd", + values = {"cpu": "freebsd"}, + visibility = ["//visibility:public"], +) + +cuda_header_library( + name = "cuda_headers", + hdrs = [ + %{cuda_headers} + ], + include_prefix = "third_party/gpus", + includes = [ + ".", + "cuda/include", + "cuda/include/crt", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cudart_static", + srcs = ["cuda/lib/%{cudart_static_lib}"], + includes = [ + ".", + "cuda/include", + ], + linkopts = select({ + ":freebsd": [], + "//conditions:default": ["-ldl"], + }) + [ + "-lpthread", + %{cudart_static_linkopt} + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cuda_driver", + srcs = ["cuda/lib/%{cuda_driver_lib}"], + includes = [ + ".", + "cuda/include", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cudart", + srcs = ["cuda/lib/%{cudart_lib}"], + data = ["cuda/lib/%{cudart_lib}"], + includes = [ + ".", + "cuda/include", + ], + linkstatic = 1, + visibility = ["//visibility:public"], +) + +cc_library( + name = "cublas", + srcs = ["cuda/lib/%{cublas_lib}"], + data = ["cuda/lib/%{cublas_lib}"], + includes = [ + ".", + "cuda/include", + ], + linkstatic = 1, + visibility = ["//visibility:public"], +) + +cc_library( + name = "cusolver", + srcs = ["cuda/lib/%{cusolver_lib}"], + data = ["cuda/lib/%{cusolver_lib}"], + includes = [ + ".", + "cuda/include", + ], + linkopts = ["-lgomp"], + linkstatic = 1, + visibility = ["//visibility:public"], +) + +cc_library( + name = "cudnn", + srcs = ["cuda/lib/%{cudnn_lib}"], + data = ["cuda/lib/%{cudnn_lib}"], + includes = [ + ".", + "cuda/include", + ], + linkstatic = 1, + visibility = ["//visibility:public"], +) + +cc_library( + name = "cudnn_header", + includes = 
[ + ".", + "cuda/include", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cufft", + srcs = ["cuda/lib/%{cufft_lib}"], + data = ["cuda/lib/%{cufft_lib}"], + includes = [ + ".", + "cuda/include", + ], + linkstatic = 1, + visibility = ["//visibility:public"], +) + +cc_library( + name = "curand", + srcs = ["cuda/lib/%{curand_lib}"], + data = ["cuda/lib/%{curand_lib}"], + includes = [ + ".", + "cuda/include", + ], + linkstatic = 1, + visibility = ["//visibility:public"], +) + +cc_library( + name = "cuda", + visibility = ["//visibility:public"], + deps = [ + ":cublas", + ":cuda_headers", + ":cudart", + ":cudnn", + ":cufft", + ":curand", + ], +) + +cc_library( + name = "cupti_headers", + hdrs = [ + "cuda/cuda_config.h", + ":cuda-extras", + ], + includes = [ + ".", + "cuda/extras/CUPTI/include/", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cupti_dsos", + data = ["cuda/lib/%{cupti_lib}"], + includes = [ + ".", + "cuda/include", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "libdevice_root", + data = [":cuda-nvvm"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "cuda_libs", + data = [ + ":cudart", + ], + linkopts = select({ + ":darwin": [ + "-Wl,-rpath,./lib", + "-Wl,-rpath,./extras/CUPTI/lib", + ], + "//conditions:default": [ + "-Wl,-rpath,./lib64", + "-Wl,-rpath,./extras/CUPTI/lib64", + ], + }), + deps = [ + ":cudart", + ], +) + +%{copy_rules} diff --git a/build_deps/toolchains/gpu/cuda/BUILD.windows.tpl b/build_deps/toolchains/gpu/cuda/BUILD.windows.tpl new file mode 100644 index 0000000..3ed4fd4 --- /dev/null +++ b/build_deps/toolchains/gpu/cuda/BUILD.windows.tpl @@ -0,0 +1,164 @@ +licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like + +package(default_visibility = ["//visibility:public"]) + +config_setting( + name = "using_nvcc", + values = { + "define": "using_cuda_nvcc=true", + }, +) + +config_setting( + name = "using_clang", + values = { + "define": 
"using_cuda_clang=true", + }, +) + +# Equivalent to using_clang && -c opt. +config_setting( + name = "using_clang_opt", + values = { + "define": "using_cuda_clang=true", + "compilation_mode": "opt", + }, +) + +config_setting( + name = "darwin", + values = {"cpu": "darwin"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "freebsd", + values = {"cpu": "freebsd"}, + visibility = ["//visibility:public"], +) + +cc_library( + name = "cuda_headers", + hdrs = [ + "cuda/cuda_config.h", + %{cuda_headers} + ], + includes = [ + ".", + "cuda/include", + "cuda/include/crt", + ], + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudart_static", + # /WHOLEARCHIVE:cudart_static.lib will cause a + # "Internal error during CImplib::EmitThunk" error. + # Treat this library as interface library to avoid being whole archived when + # linking a DLL that depends on this. + # TODO(pcloudy): Remove this rule after b/111278841 is resolved. + interface_library = "cuda/lib/%{cudart_static_lib}", + system_provided = 1, + visibility = ["//visibility:public"], +) + +cc_import( + name = "cuda_driver", + interface_library = "cuda/lib/%{cuda_driver_lib}", + system_provided = 1, + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudart", + interface_library = "cuda/lib/%{cudart_lib}", + system_provided = 1, + visibility = ["//visibility:public"], +) + +cc_import( + name = "cublas", + interface_library = "cuda/lib/%{cublas_lib}", + system_provided = 1, + visibility = ["//visibility:public"], +) + +cc_import( + name = "cusolver", + interface_library = "cuda/lib/%{cusolver_lib}", + system_provided = 1, + visibility = ["//visibility:public"], +) + +cc_import( + name = "cudnn", + interface_library = "cuda/lib/%{cudnn_lib}", + system_provided = 1, + visibility = ["//visibility:public"], +) + +cc_library( + name = "cudnn_header", + includes = [ + ".", + "cuda/include", + ], + visibility = ["//visibility:public"], +) + +cc_import( + name = "cufft", + 
interface_library = "cuda/lib/%{cufft_lib}", + system_provided = 1, + visibility = ["//visibility:public"], +) + +cc_import( + name = "curand", + interface_library = "cuda/lib/%{curand_lib}", + system_provided = 1, + visibility = ["//visibility:public"], +) + +cc_library( + name = "cuda", + visibility = ["//visibility:public"], + deps = [ + ":cublas", + ":cuda_headers", + ":cudart", + ":cudnn", + ":cufft", + ":curand", + ], +) + +cc_library( + name = "cupti_headers", + hdrs = [ + "cuda/cuda_config.h", + ":cuda-extras", + ], + includes = [ + ".", + "cuda/", + "cuda/extras/CUPTI/include/", + ], + visibility = ["//visibility:public"], +) + +cc_import( + name = "cupti_dsos", + interface_library = "cuda/lib/%{cupti_lib}", + system_provided = 1, + visibility = ["//visibility:public"], +) + +cc_library( + name = "libdevice_root", + data = [":cuda-nvvm"], + visibility = ["//visibility:public"], +) + +%{copy_rules} diff --git a/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl b/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl new file mode 100644 index 0000000..a4f484f --- /dev/null +++ b/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl @@ -0,0 +1,62 @@ +# Macros for building CUDA code. +def if_cuda(if_true, if_false = []): + """Shorthand for select()'ing on whether we're building with CUDA. + + Returns a select statement which evaluates to if_true if we're building + with CUDA enabled. Otherwise, the select statement evaluates to if_false. 
+ + """ + return select({ + "@local_config_cuda//cuda:using_nvcc": if_true, + "@local_config_cuda//cuda:using_clang": if_true, + "//conditions:default": if_false + }) + + +def cuda_default_copts(): + """Default options for all CUDA compilations.""" + return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + %{cuda_extra_copts}) + + +def cuda_is_configured(): + """Returns true if CUDA was enabled during the configure process.""" + return %{cuda_is_configured} + +def if_cuda_is_configured(x): + """Tests if the CUDA was enabled during the configure process. + + Unlike if_cuda(), this does not require that we are building with + --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries. + """ + if cuda_is_configured(): + return x + return [] + +def cuda_header_library( + name, + hdrs, + include_prefix = None, + strip_include_prefix = None, + deps = [], + **kwargs): + """Generates a cc_library containing both virtual and system include paths. + + Generates both a header-only target with virtual includes plus the full + target without virtual includes. This works around the fact that bazel can't + mix 'includes' and 'include_prefix' in the same target.""" + + native.cc_library( + name = name + "_virtual", + hdrs = hdrs, + include_prefix = include_prefix, + strip_include_prefix = strip_include_prefix, + deps = deps, + visibility = ["//visibility:private"], + ) + + native.cc_library( + name = name, + textual_hdrs = hdrs, + deps = deps + [":%s_virtual" % name], + **kwargs + ) diff --git a/build_deps/toolchains/gpu/cuda/cuda_config.h.tpl b/build_deps/toolchains/gpu/cuda/cuda_config.h.tpl new file mode 100644 index 0000000..811b040 --- /dev/null +++ b/build_deps/toolchains/gpu/cuda/cuda_config.h.tpl @@ -0,0 +1,26 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef CUDA_CUDA_CONFIG_H_ +#define CUDA_CUDA_CONFIG_H_ + +#define TF_CUDA_CAPABILITIES %{cuda_compute_capabilities} + +#define TF_CUDA_VERSION "%{cuda_version}" +#define TF_CUDNN_VERSION "%{cudnn_version}" + +#define TF_CUDA_TOOLKIT_PATH "%{cuda_toolkit_path}" + +#endif // CUDA_CUDA_CONFIG_H_ diff --git a/build_deps/toolchains/gpu/cuda_configure.bzl b/build_deps/toolchains/gpu/cuda_configure.bzl new file mode 100644 index 0000000..ba38c6b --- /dev/null +++ b/build_deps/toolchains/gpu/cuda_configure.bzl @@ -0,0 +1,1116 @@ +# -*- Python -*- +"""Repository rule for CUDA autoconfiguration. +`cuda_configure` depends on the following environment variables: + * `TF_NEED_CUDA`: Whether to enable building with CUDA. + * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path + * `TF_CUDA_CLANG`: Whether to use clang as a cuda compiler. + * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for + both host and device code compilation if TF_CUDA_CLANG is 1. + * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is + `/usr/local/cuda,usr/`. + * `CUDA_TOOLKIT_PATH`: The path to the CUDA toolkit. Default is + `/usr/local/cuda`. + * `TF_CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then + use the system default. + * `TF_CUDNN_VERSION`: The version of the cuDNN library. + * `CUDNN_INSTALL_PATH`: The path to the cuDNN library. Default is + `/usr/local/cuda`. + * `TF_CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is + `3.5,5.2`. 
+ * `PYTHON_BIN_PATH`: The python binary path +""" + +load( + "@bazel_tools//tools/cpp:lib_cc_configure.bzl", + "escape_string", + "get_env_var", +) +load( + "@bazel_tools//tools/cpp:windows_cc_configure.bzl", + "find_msvc_tool", + "find_vc_path", + "setup_vc_env_vars", +) + +_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" + +_CLANG_CUDA_COMPILER_PATH = "CLANG_CUDA_COMPILER_PATH" + +_CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH" + +_TF_CUDA_VERSION = "TF_CUDA_VERSION" + +_TF_CUDNN_VERSION = "TF_CUDNN_VERSION" + +_CUDNN_INSTALL_PATH = "CUDNN_INSTALL_PATH" + +_TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES" + +_TF_DOWNLOAD_CLANG = "TF_DOWNLOAD_CLANG" + +_PYTHON_BIN_PATH = "PYTHON_BIN_PATH" + +_DEFAULT_CUDA_COMPUTE_CAPABILITIES = [ + "3.5", + "5.2", +] + +def _get_python_bin(repository_ctx): + """Gets the python bin path.""" + python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH) + if python_bin != None: + return python_bin + python_bin_name = "python.exe" if _is_windows(repository_ctx) else "python" + python_bin_path = repository_ctx.which(python_bin_name) + if python_bin_path != None: + return str(python_bin_path) + auto_configure_fail( + "Cannot find python in PATH, please make sure " + + "python is installed and add its directory in PATH, or --define " + + "%s='/something/else'.\nPATH=%s" % ( + _PYTHON_BIN_PATH, + repository_ctx.os.environ.get("PATH", ""), + ), + ) + +def _get_nvcc_tmp_dir_for_windows(repository_ctx): + """Return the tmp directory for nvcc to generate intermediate source files.""" + escaped_tmp_dir = escape_string( + get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace( + "\\", + "\\\\", + ), + ) + return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir" + +def _get_msvc_compiler(repository_ctx): + vc_path = find_vc_path(repository_ctx) + return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/") + +def _get_win_cuda_defines(repository_ctx): + """Return CROSSTOOL defines for Windows""" + + # If we are 
not on Windows, return empty vaules for Windows specific fields. + # This ensures the CROSSTOOL file parser is happy. + if not _is_windows(repository_ctx): + return dict({ + "%{msvc_env_tmp}": "", + "%{msvc_env_path}": "", + "%{msvc_env_include}": "", + "%{msvc_env_lib}": "", + "%{msvc_cl_path}": "", + "%{msvc_ml_path}": "", + "%{msvc_link_path}": "", + "%{msvc_lib_path}": "", + "%{cxx_builtin_include_directory}": "", + }) + + vc_path = find_vc_path(repository_ctx) + if not vc_path: + auto_configure_fail( + "Visual C++ build tools not found on your machine. " + + "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using", + ) + return {} + + env = setup_vc_env_vars(repository_ctx, vc_path) + escaped_paths = escape_string(env["PATH"]) + escaped_include_paths = escape_string(env["INCLUDE"]) + escaped_lib_paths = escape_string(env["LIB"]) + escaped_tmp_dir = escape_string( + get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace( + "\\", + "\\\\", + ), + ) + + msvc_cl_path = _get_python_bin(repository_ctx) + msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace( + "\\", + "/", + ) + msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace( + "\\", + "/", + ) + msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace( + "\\", + "/", + ) + + # nvcc will generate some temporary source files under %{nvcc_tmp_dir} + # The generated files are guranteed to have unique name, so they can share the same tmp directory + escaped_cxx_include_directories = [ + "cxx_builtin_include_directory: \"%s\"" % + _get_nvcc_tmp_dir_for_windows(repository_ctx), + ] + for path in escaped_include_paths.split(";"): + if path: + escaped_cxx_include_directories.append( + "cxx_builtin_include_directory: \"%s\"" % path, + ) + + return { + "%{msvc_env_tmp}": escaped_tmp_dir, + "%{msvc_env_path}": escaped_paths, + "%{msvc_env_include}": escaped_include_paths, + "%{msvc_env_lib}": 
escaped_lib_paths, + "%{msvc_cl_path}": msvc_cl_path, + "%{msvc_ml_path}": msvc_ml_path, + "%{msvc_link_path}": msvc_link_path, + "%{msvc_lib_path}": msvc_lib_path, + "%{cxx_builtin_include_directory}": "\n".join(escaped_cxx_include_directories), + } + +def find_cc(repository_ctx): + """Find the C++ compiler.""" + if _is_windows(repository_ctx): + return _get_msvc_compiler(repository_ctx) + + target_cc_name = "gcc" + cc_path_envvar = _GCC_HOST_COMPILER_PATH + cc_name = target_cc_name + + if cc_path_envvar in repository_ctx.os.environ: + cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip() + if cc_name_from_env: + cc_name = cc_name_from_env + if cc_name.startswith("/"): + # Absolute path, maybe we should make this supported by our which function. + return cc_name + cc = repository_ctx.which(cc_name) + if cc == None: + fail(("Cannot find {}, either correct your path or set the {}" + + " environment variable").format(target_cc_name, cc_path_envvar)) + return cc + +_INC_DIR_MARKER_BEGIN = "#include <...>" + +# OSX add " (framework directory)" at the end of line, strip it. +_OSX_FRAMEWORK_SUFFIX = " (framework directory)" + +_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) + +def _cxx_inc_convert(path): + """Convert path returned by cc -E xc++ in a complete path.""" + path = path.strip() + if path.endswith(_OSX_FRAMEWORK_SUFFIX): + path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() + return path + +def _normalize_include_path(repository_ctx, path): + """Normalizes include paths before writing them to the crosstool. + If path points inside the 'crosstool' folder of the repository, a relative + path is returned. + If path points outside the 'crosstool' folder, an absolute path is returned. + """ + path = str(repository_ctx.path(path)) + crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) + + if path.startswith(crosstool_folder): + # We drop the path to "$REPO/crosstool" and a trailing path separator. 
+ return path[len(crosstool_folder) + 1:] + return path + +def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp): + """Compute the list of default C or C++ include directories.""" + if lang_is_cpp: + lang = "c++" + else: + lang = "c" + result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"]) + index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN) + if index1 == -1: + return [] + index1 = result.stderr.find("\n", index1) + if index1 == -1: + return [] + index2 = result.stderr.rfind("\n ") + if index2 == -1 or index2 < index1: + return [] + index2 = result.stderr.find("\n", index2 + 1) + if index2 == -1: + inc_dirs = result.stderr[index1 + 1:] + else: + inc_dirs = result.stderr[index1 + 1:index2].strip() + + return [ + _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) + for p in inc_dirs.split("\n") + ] + +def get_cxx_inc_directories(repository_ctx, cc): + """Compute the list of default C and C++ include directories.""" + + # For some reason `clang -xc` sometimes returns include paths that are + # different from the ones from `clang -xc++`. (Symlink and a dir) + # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists + includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True) + includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False) + + return includes_cpp + [ + inc + for inc in includes_c + if inc not in includes_cpp + ] + +def auto_configure_fail(msg): + """Output failure message when cuda configuration fails.""" + red = "\033[0;31m" + no_color = "\033[0m" + fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg)) + +# END cc_configure common functions (see TODO above). + +def _host_compiler_includes(repository_ctx, cc): + """Generates the cxx_builtin_include_directory entries for gcc inc dirs. + Args: + repository_ctx: The repository context. + cc: The path to the gcc host compiler. 
+ Returns: + A string containing the cxx_builtin_include_directory for each of the gcc + host compiler include directories, which can be added to the CROSSTOOL + file. + """ + inc_dirs = get_cxx_inc_directories(repository_ctx, cc) + inc_entries = [] + for inc_dir in inc_dirs: + inc_entries.append(" cxx_builtin_include_directory: \"%s\"" % inc_dir) + return "\n".join(inc_entries) + +def _cuda_include_path(repository_ctx, cuda_config): + """Generates the cxx_builtin_include_directory entries for cuda inc dirs. + Args: + repository_ctx: The repository context. + cc: The path to the gcc host compiler. + Returns: + A string containing the cxx_builtin_include_directory for each of the gcc + host compiler include directories, which can be added to the CROSSTOOL + file. + """ + nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % ( + cuda_config.cuda_toolkit_path, + ".exe" if cuda_config.cpu_value == "Windows" else "", + )) + result = repository_ctx.execute([ + nvcc_path, + "-v", + "/dev/null", + "-o", + "/dev/null", + ]) + target_dir = "" + for one_line in result.stderr.splitlines(): + if one_line.startswith("#$ _TARGET_DIR_="): + target_dir = ( + cuda_config.cuda_toolkit_path + "/" + one_line.replace( + "#$ _TARGET_DIR_=", + "", + ) + "/include" + ) + inc_entries = [] + if target_dir != "": + inc_entries.append(" cxx_builtin_include_directory: \"%s\"" % target_dir) + default_include = cuda_config.cuda_toolkit_path + "/include" + inc_entries.append( + " cxx_builtin_include_directory: \"%s\"" % default_include, + ) + return "\n".join(inc_entries) + +def enable_cuda(repository_ctx): + if "TF_NEED_CUDA" in repository_ctx.os.environ: + enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip() + return enable_cuda == "1" + return False + +def matches_version(environ_version, detected_version): + """Checks whether the user-specified version matches the detected version. 
+ This function performs a weak matching so that if the user specifies only + the + major or major and minor versions, the versions are still considered + matching + if the version parts match. To illustrate: + environ_version detected_version result + ----------------------------------------- + 5.1.3 5.1.3 True + 5.1 5.1.3 True + 5 5.1 True + 5.1.3 5.1 False + 5.2.3 5.1.3 False + Args: + environ_version: The version specified by the user via environment + variables. + detected_version: The version autodetected from the CUDA installation on + the system. + Returns: True if user-specified version matches detected version and False + otherwise. + """ + environ_version_parts = environ_version.split(".") + detected_version_parts = detected_version.split(".") + if len(detected_version_parts) < len(environ_version_parts): + return False + for i, part in enumerate(detected_version_parts): + if i >= len(environ_version_parts): + break + if part != environ_version_parts[i]: + return False + return True + +def find_cuda_define(repository_ctx, header_dir, header_file, define): + """Returns the value of a #define in a header file. + Greps through a header file and returns the value of the specified #define. + If the #define is not found, then raise an error. + Args: + repository_ctx: The repository context. + header_dir: The directory containing the header file. + header_file: The header file name. + define: The #define to search for. + Returns: + The value of the #define found in the header. + """ + + # Confirm location of the header and grep for the line defining the macro. + h_path = repository_ctx.path("%s/%s" % (header_dir, header_file)) + if not h_path.exists: + auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path))) + result = repository_ctx.execute( + # Grep one more lines as some #defines are split into two lines. 
+ [ + "grep", + "--color=never", + "-A1", + "-E", + define, + str(h_path), + ], + ) + if result.stderr: + auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr)) + + # Parse the version from the line defining the macro. + if result.stdout.find(define) == -1: + auto_configure_fail( + "Cannot find line containing '%s' in %s" % (define, h_path), + ) + + # Split results to lines + lines = result.stdout.split("\n") + num_lines = len(lines) + for l in range(num_lines): + line = lines[l] + if define in line: # Find the line with define + version = line + if l != num_lines - 1 and line[-1] == "\\": # Add next line, if multiline + version = version[:-1] + lines[l + 1] + break + + # Remove any comments + version = version.split("//")[0] + + # Remove define name + version = version.replace(define, "").strip() + + # Remove the code after the version number. + version_end = version.find(" ") + if version_end != -1: + if version_end == 0: + auto_configure_fail( + "Cannot extract the version from line containing '%s' in %s" % + (define, str(h_path)), + ) + version = version[:version_end].strip() + return version + +def compute_capabilities(repository_ctx): + """Returns a list of strings representing cuda compute capabilities.""" + if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ: + return _DEFAULT_CUDA_COMPUTE_CAPABILITIES + capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES] + capabilities = capabilities_str.split(",") + for capability in capabilities: + # Workaround for Skylark's lack of support for regex. This check should + # be equivalent to checking: + # if re.match("[0-9]+.[0-9]+", capability) == None: + parts = capability.split(".") + if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit(): + auto_configure_fail("Invalid compute capability: %s" % capability) + return capabilities + +def get_cpu_value(repository_ctx): + """Returns the name of the host operating system. 
+ Args: + repository_ctx: The repository context. + Returns: + A string containing the name of the host operating system. + """ + os_name = repository_ctx.os.name.lower() + if os_name.startswith("mac os"): + return "Darwin" + if os_name.find("windows") != -1: + return "Windows" + result = repository_ctx.execute(["uname", "-s"]) + return result.stdout.strip() + +def _is_windows(repository_ctx): + """Returns true if the host operating system is windows.""" + return get_cpu_value(repository_ctx) == "Windows" + +def lib_name(base_name, cpu_value, version = None, static = False): + """Constructs the platform-specific name of a library. + Args: + base_name: The name of the library, such as "cudart" + cpu_value: The name of the host operating system. + version: The version of the library. + static: True the library is static or False if it is a shared object. + Returns: + The platform-specific name of the library. + """ + version = "" if not version else "." + version + if cpu_value in ("Linux", "FreeBSD"): + if static: + return "lib%s.a" % base_name + return "lib%s.so%s" % (base_name, version) + elif cpu_value == "Windows": + return "%s.lib" % base_name + elif cpu_value == "Darwin": + if static: + return "lib%s.a" % base_name + return "lib%s%s.dylib" % (base_name, version) + else: + auto_configure_fail("Invalid cpu_value: %s" % cpu_value) + +def find_lib(repository_ctx, paths, check_soname = True): + """ + Finds a library among a list of potential paths. + Args: + paths: List of paths to inspect. + Returns: + Returns the first path in paths that exist. 
+ """ + objdump = repository_ctx.which("objdump") + mismatches = [] + for path in [repository_ctx.path(path) for path in paths]: + if not path.exists: + continue + if check_soname and objdump != None and not _is_windows(repository_ctx): + output = repository_ctx.execute([objdump, "-p", str(path)]).stdout + output = [line for line in output.splitlines() if "SONAME" in line] + sonames = [line.strip().split(" ")[-1] for line in output] + if not any([soname == path.basename for soname in sonames]): + mismatches.append(str(path)) + continue + return path + if mismatches: + auto_configure_fail( + "None of the libraries match their SONAME: " + ", ".join(mismatches), + ) + auto_configure_fail("No library found under: " + ", ".join(paths)) + +def _find_cuda_lib( + lib, + repository_ctx, + cpu_value, + basedir, + version, + static = False): + """Finds the given CUDA or cuDNN library on the system. + Args: + lib: The name of the library, such as "cudart" + repository_ctx: The repository context. + cpu_value: The name of the host operating system. + basedir: The install directory of CUDA or cuDNN. + version: The version of the library. + static: True if static library, False if shared object. + Returns: + Returns the path to the library. + """ + file_name = lib_name(lib, cpu_value, version, static) + + return find_lib( + repository_ctx, + ["%s/%s" % (basedir, file_name)], + check_soname = version and not static, + ) + +def _find_libs(repository_ctx, cuda_config): + """Returns the CUDA and cuDNN libraries on the system. + Args: + repository_ctx: The repository context. + cuda_config: The CUDA config as returned by _get_cuda_config + Returns: + Map of library names to structs of filename and path. 
+ """ + cpu_value = cuda_config.cpu_value + stub_dir = "" if _is_windows(repository_ctx) else "/stubs" + return { + "cuda": _find_cuda_lib( + "cuda", + repository_ctx, + cpu_value, + cuda_config.config["cuda_library_dir"] + stub_dir, + None, + ), + "cudart": _find_cuda_lib( + "cudart", + repository_ctx, + cpu_value, + cuda_config.config["cuda_library_dir"], + cuda_config.cudart_version, + ), + "cudart_static": _find_cuda_lib( + "cudart_static", + repository_ctx, + cpu_value, + cuda_config.config["cuda_library_dir"], + cuda_config.cuda_version, + static = True, + ), + "cublas": _find_cuda_lib( + "cublas", + repository_ctx, + cpu_value, + cuda_config.config["cublas_library_dir"], + cuda_config.cublas_version, + ), + "cusolver": _find_cuda_lib( + "cusolver", + repository_ctx, + cpu_value, + cuda_config.config["cusolver_library_dir"], + cuda_config.cusolver_version, + ), + "curand": _find_cuda_lib( + "curand", + repository_ctx, + cpu_value, + cuda_config.config["curand_library_dir"], + cuda_config.curand_version, + ), + "cufft": _find_cuda_lib( + "cufft", + repository_ctx, + cpu_value, + cuda_config.config["cufft_library_dir"], + cuda_config.cufft_version, + ), + "cudnn": _find_cuda_lib( + "cudnn", + repository_ctx, + cpu_value, + cuda_config.config["cudnn_library_dir"], + cuda_config.cudnn_version, + ), + "cupti": _find_cuda_lib( + "cupti", + repository_ctx, + cpu_value, + cuda_config.config["cupti_library_dir"], + cuda_config.cuda_version, + ), + } + +def _cudart_static_linkopt(cpu_value): + """Returns additional platform-specific linkopts for cudart.""" + return "" if cpu_value == "Darwin" else "\"-lrt\"," + +def _get_cuda_config(repository_ctx): + """Detects and returns information about the CUDA installation on the system. + Args: + repository_ctx: The repository context. + Returns: + A struct containing the following fields: + cuda_toolkit_path: The CUDA toolkit installation directory. + cudnn_install_basedir: The cuDNN installation directory. 
+ cuda_version: The version of CUDA on the system.
+ cudart_version: The CUDA runtime version on the system.
+ cudnn_version: The version of cuDNN on the system.
+ compute_capabilities: A list of the system's CUDA compute capabilities.
+ cpu_value: The name of the host operating system.
+ """
+ config = find_cuda_config(repository_ctx, ["cuda", "cudnn"])
+ cpu_value = get_cpu_value(repository_ctx)
+ toolkit_path = config["cuda_toolkit_path"]
+
+ is_windows = _is_windows(repository_ctx)
+ cuda_version = config["cuda_version"].split(".")
+ cuda_major = cuda_version[0]
+ cuda_minor = cuda_version[1]
+
+ cuda_version = ("64_%s%s" if is_windows else "%s.%s") % (cuda_major, cuda_minor)
+ cudnn_version = ("64_%s" if is_windows else "%s") % config["cudnn_version"]
+
+ if int(cuda_major) >= 11:
+ # The libcudart soname in CUDA 11.x is versioned as 11.0 for backward compatibility.
+ if int(cuda_major) == 11:
+ cudart_version = "64_110" if is_windows else "11.0"
+ else:
+ cudart_version = ("64_%s" if is_windows else "%s") % cuda_major
+ cublas_version = ("64_%s" if is_windows else "%s") % config["cublas_version"].split(".")[0]
+ cusolver_version = ("64_%s" if is_windows else "%s") % config["cusolver_version"].split(".")[0]
+ curand_version = ("64_%s" if is_windows else "%s") % config["curand_version"].split(".")[0]
+ cufft_version = ("64_%s" if is_windows else "%s") % config["cufft_version"].split(".")[0]
+ elif (int(cuda_major), int(cuda_minor)) >= (10, 1):
+ # cuda_lib_version is for libraries like cuBLAS, cuFFT, cuSOLVER, etc.
+ # It changed from 'x.y' to just 'x' in CUDA 10.1.
+ cuda_lib_version = ("64_%s" if is_windows else "%s") % cuda_major + cublas_version = cuda_lib_version + cusolver_version = cuda_lib_version + curand_version = cuda_lib_version + cufft_version = cuda_lib_version + cudart_version = cuda_version + else: + cublas_version = cuda_version + cusolver_version = cuda_version + curand_version = cuda_version + cufft_version = cuda_version + cudart_version = cuda_version + + return struct( + cuda_toolkit_path = toolkit_path, + cuda_version = cuda_version, + cudart_version = cudart_version, + cublas_version = cublas_version, + cusolver_version = cusolver_version, + curand_version = curand_version, + cufft_version = cufft_version, + cudnn_version = cudnn_version, + compute_capabilities = compute_capabilities(repository_ctx), + cpu_value = cpu_value, + config = config, + ) + +def _tpl(repository_ctx, tpl, substitutions = {}, out = None): + if substitutions == None: + substitutions = {} + if not out: + out = tpl.replace(":", "/") + repository_ctx.template( + out, + Label("//build_deps/toolchains/gpu/%s.tpl" % tpl), + substitutions, + ) + +_DUMMY_CROSSTOOL_BUILD_FILE = """ +load("//crosstool:error_gpu_disabled.bzl", "error_gpu_disabled") +error_gpu_disabled() +""" + +def _create_dummy_repository(repository_ctx): + cpu_value = get_cpu_value(repository_ctx) + + # Set up BUILD file for cuda/. 
+ _tpl( + repository_ctx, + "cuda:build_defs.bzl", + { + "%{cuda_is_configured}": "False", + "%{cuda_extra_copts}": "[]", + }, + ) + _tpl( + repository_ctx, + "cuda:BUILD", + { + "%{cuda_driver_lib}": lib_name("cuda", cpu_value), + "%{cudart_static_lib}": lib_name( + "cudart_static", + cpu_value, + static = True, + ), + "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value), + "%{cudart_lib}": lib_name("cudart", cpu_value), + "%{cublas_lib}": lib_name("cublas", cpu_value), + "%{cusolver_lib}": lib_name("cusolver", cpu_value), + "%{cudnn_lib}": lib_name("cudnn", cpu_value), + "%{cufft_lib}": lib_name("cufft", cpu_value), + "%{curand_lib}": lib_name("curand", cpu_value), + "%{cupti_lib}": lib_name("cupti", cpu_value), + "%{copy_rules}": "", + "%{cuda_headers}": "", + }, + ) + + # Create dummy files for the CUDA toolkit since they are still required by + # tensorflow/core/platform/default/build_config:cuda. + repository_ctx.file("cuda/cuda/include/cuda.h") + repository_ctx.file("cuda/cuda/include/cublas.h") + repository_ctx.file("cuda/cuda/include/cudnn.h") + repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h") + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cuda", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudart", cpu_value)) + repository_ctx.file( + "cuda/cuda/lib/%s" % lib_name("cudart_static", cpu_value), + ) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublas", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusolver", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudnn", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("curand", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cufft", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cupti", cpu_value)) + +def _execute( + repository_ctx, + cmdline, + error_msg = None, + error_details = None, + empty_stdout_fine = False): + """Executes an arbitrary shell command. 
+ Args: + repository_ctx: the repository_ctx object + cmdline: list of strings, the command to execute + error_msg: string, a summary of the error if the command fails + error_details: string, details about the error or steps to fix it + empty_stdout_fine: bool, if True, an empty stdout result is fine, + otherwise it's an error + Return: the result of repository_ctx.execute(cmdline) + """ + result = repository_ctx.execute(cmdline) + if result.stderr or not (empty_stdout_fine or result.stdout): + auto_configure_fail( + "\n".join([ + error_msg.strip() if error_msg else "Repository command failed", + result.stderr.strip(), + error_details if error_details else "", + ]), + ) + return result + +def _norm_path(path): + """Returns a path with '/' and remove the trailing slash.""" + path = path.replace("\\", "/") + if path[-1] == "/": + path = path[:-1] + return path + +def make_copy_files_rule(repository_ctx, name, srcs, outs): + """Returns a rule to copy a set of files.""" + cmds = [] + + # Copy files. + for src, out in zip(srcs, outs): + cmds.append('cp -f "%s" $(location %s)' % (src, out)) + outs = [(' "%s",' % out) for out in outs] + return """genrule( + name = "%s", + outs = [ +%s + ], + cmd = \"""%s \""", +)""" % (name, "\n".join(outs), " && ".join(cmds)) + +def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir): + """Returns a rule to recursively copy a directory.""" + src_dir = _norm_path(src_dir) + out_dir = _norm_path(out_dir) + outs = _read_dir(repository_ctx, src_dir) + outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] + + # '@D' already contains the relative path for a single file, see + # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables + out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" + return """genrule( + name = "%s", + outs = [ +%s + ], + cmd = \"""cp -rLf "%s/." 
"%s/" \""", +)""" % (name, "\n".join(outs), src_dir, out_dir) + +def _read_dir(repository_ctx, src_dir): + """Returns a string with all files in a directory. + Finds all files inside a directory, traversing subfolders and following + symlinks. The returned string contains the full path of all files + separated by line breaks. + """ + if _is_windows(repository_ctx): + src_dir = src_dir.replace("/", "\\") + find_result = _execute( + repository_ctx, + ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"], + empty_stdout_fine = True, + ) + + # src_files will be used in genrule.outs where the paths must + # use forward slashes. + result = find_result.stdout.replace("\\", "/") + else: + find_result = _execute( + repository_ctx, + ["find", src_dir, "-follow", "-type", "f"], + empty_stdout_fine = True, + ) + result = find_result.stdout + return sorted(result.splitlines()) + +def _create_local_cuda_repository(repository_ctx): + """Creates the repository containing files set up to build with CUDA.""" + cuda_config = _get_cuda_config(repository_ctx) + + cuda_include_path = cuda_config.config["cuda_include_dir"] + cublas_include_path = cuda_config.config["cublas_include_dir"] + cudnn_header_dir = cuda_config.config["cudnn_include_dir"] + cupti_header_dir = cuda_config.config["cupti_include_dir"] + nvvm_libdevice_dir = cuda_config.config["nvvm_library_dir"] + + # Create genrule to copy files from the installed CUDA toolkit into execroot. 
+ copy_rules = [ + make_copy_dir_rule( + repository_ctx, + name = "cuda-include", + src_dir = cuda_include_path, + out_dir = "cuda/include", + ), + make_copy_dir_rule( + repository_ctx, + name = "cuda-nvvm", + src_dir = nvvm_libdevice_dir, + out_dir = "cuda/nvvm/libdevice", + ), + make_copy_dir_rule( + repository_ctx, + name = "cuda-extras", + src_dir = cupti_header_dir, + out_dir = "cuda/extras/CUPTI/include", + ), + ] + + copy_rules.append(make_copy_files_rule( + repository_ctx, + name = "cublas-include", + srcs = [ + cublas_include_path + "/cublas.h", + cublas_include_path + "/cublas_v2.h", + cublas_include_path + "/cublas_api.h", + ], + outs = [ + "cublas/include/cublas.h", + "cublas/include/cublas_v2.h", + "cublas/include/cublas_api.h", + ], + )) + + cuda_libs = _find_libs(repository_ctx, cuda_config) + cuda_lib_srcs = [] + cuda_lib_outs = [] + for path in cuda_libs.values(): + cuda_lib_srcs.append(str(path)) + cuda_lib_outs.append("cuda/lib/" + path.basename) + copy_rules.append(make_copy_files_rule( + repository_ctx, + name = "cuda-lib", + srcs = cuda_lib_srcs, + outs = cuda_lib_outs, + )) + + copy_rules.append(make_copy_dir_rule( + repository_ctx, + name = "cuda-bin", + src_dir = cuda_config.cuda_toolkit_path + "/bin", + out_dir = "cuda/bin", + )) + + # Copy cudnn.h if cuDNN was not installed to CUDA_TOOLKIT_PATH. 
+ included_files = _read_dir(repository_ctx, cuda_include_path) + if not any([file.endswith("cudnn.h") for file in included_files]): + if [int(x) for x in cuda_config.cudnn_version.split(".")] < [8, 0]: + cudnn_headers = ["cudnn.h"] + else: + cudnn_headers = [ + "cudnn_adv_infer.h", + "cudnn_adv_train.h", + "cudnn_cnn_infer.h", + "cudnn_cnn_train.h", + "cudnn_ops_infer.h", + "cudnn_ops_train.h", + "cudnn.h", + "cudnn_version.h", + ] + cudnn_srcs = [] + cudnn_outs = [] + for header in cudnn_headers: + cudnn_srcs.append(cudnn_header_dir + "/" + header) + cudnn_outs.append("cudnn/include/" + header) + + copy_rules.append(make_copy_files_rule( + repository_ctx, + name = "cudnn-include", + srcs = cudnn_srcs, + outs = cudnn_outs, + )) + else: + copy_rules.append("filegroup(name = 'cudnn-include')\n") + + # Set up BUILD file for cuda/ + _tpl( + repository_ctx, + "cuda:build_defs.bzl", + { + "%{cuda_is_configured}": "True", + "%{cuda_extra_copts}": "[]", + }, + ) + + _tpl( + repository_ctx, + "cuda:BUILD", + { + "%{cuda_driver_lib}": cuda_libs["cuda"].basename, + "%{cudart_static_lib}": cuda_libs["cudart_static"].basename, + "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), + "%{cudart_lib}": cuda_libs["cudart"].basename, + "%{cublas_lib}": cuda_libs["cublas"].basename, + "%{cusolver_lib}": cuda_libs["cusolver"].basename, + "%{cudnn_lib}": cuda_libs["cudnn"].basename, + "%{cufft_lib}": cuda_libs["cufft"].basename, + "%{curand_lib}": cuda_libs["curand"].basename, + "%{cupti_lib}": cuda_libs["cupti"].basename, + "%{copy_rules}": "\n".join(copy_rules), + "%{cuda_headers}": ( + '":cuda-include",\n' + ' ":cudnn-include",' + ), + }, + "cuda/BUILD", + ) + + # Set up crosstool/ + cc = find_cc(repository_ctx) + cc_fullpath = cc + + host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath) + + cuda_defines = {} + + # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see + # https://github.com/bazelbuild/bazel/issues/760). 
+ # However, this stops our custom clang toolchain from picking the provided + # LLD linker, so we're only adding '-B/usr/bin' when using non-downloaded + # toolchain. + # TODO: when bazel stops adding '-B/usr/bin' by default, remove this + # flag from the CROSSTOOL completely (see + # https://github.com/bazelbuild/bazel/issues/5634) + cuda_defines["%{linker_bin_path_flag}"] = 'flag: "-B/usr/bin"' + + cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc" + cuda_defines["%{host_compiler_warnings}"] = "" + + # nvcc has the system include paths built in and will automatically + # search them; we cannot work around that, so we add the relevant cuda + # system paths to the allowed compiler specific include paths. + cuda_defines["%{host_compiler_includes}"] = ( + host_compiler_includes + "\n" + _cuda_include_path( + repository_ctx, + cuda_config, + ) + + "\n cxx_builtin_include_directory: \"%s\"" % cupti_header_dir + + "\n cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir + ) + + # For gcc, do not canonicalize system header paths; some versions of gcc + # pick the shortest possible path for system includes when creating the + # .d file - given that includes that are prefixed with "../" multiple + # time quickly grow longer than the root of the tree, this can lead to + # bazel's header check failing. 
+ cuda_defines["%{extra_no_canonical_prefixes_flags}"] = ( + "flag: \"-fno-canonical-system-headers\"" + ) + nvcc_path = str( + repository_ctx.path("%s/bin/nvcc%s" % ( + cuda_config.cuda_toolkit_path, + ".exe" if _is_windows(repository_ctx) else "", + )), + ) + + builtin_include_directories = [] + for one_line in cuda_defines["%{host_compiler_includes}"].splitlines(): + inc_dir = one_line.split(":")[1][2:-1] + builtin_include_directories.append(inc_dir) + + _tpl( + repository_ctx, + "crosstool:BUILD", + { + "%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc", + "%{cxx_builtin_include_directories}": ",".join(builtin_include_directories), + "%{win_linker_files}": ":windows_msvc_wrapper_files", + }, + ) + wrapper_defines = { + "%{cpu_compiler}": str(cc), + "%{cuda_version}": cuda_config.cuda_version, + "%{nvcc_path}": nvcc_path, + "%{gcc_host_compiler_path}": str(cc), + "%{cuda_compute_capabilities}": ", ".join( + ["\"%s\"" % c for c in cuda_config.compute_capabilities], + ), + "%{nvcc_tmp_dir}": _get_nvcc_tmp_dir_for_windows(repository_ctx), + } + + _tpl( + repository_ctx, + "crosstool:cc_toolchain_config.bzl", + wrapper_defines, + ) + _tpl( + repository_ctx, + "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc", + wrapper_defines, + ) + + _tpl( + repository_ctx, + "crosstool:windows/msvc_wrapper_for_nvcc.py", + wrapper_defines, + ) + + _tpl( + repository_ctx, + "crosstool:CROSSTOOL", + cuda_defines.update(_get_win_cuda_defines(repository_ctx)), + out = "crosstool/CROSSTOOL", + ) + +def find_cuda_config(repository_ctx, cuda_libraries): + """Returns CUDA config dictionary from running find_cuda_config.py""" + exec_result = repository_ctx.execute([ + _get_python_bin(repository_ctx), + repository_ctx.path(Label("//build_deps/toolchains/gpu:find_cuda_config.py")), + ] + cuda_libraries) + if exec_result.return_code: + auto_configure_fail("Failed to run find_cuda_config.py: %s" % exec_result.stderr) + + # Parse the dict from stdout. 
+ return dict([tuple(x.split(": ")) for x in exec_result.stdout.splitlines()]) + +def _cuda_autoconf_impl(repository_ctx): + """Implementation of the cuda_autoconf repository rule.""" + if not enable_cuda(repository_ctx): + _create_dummy_repository(repository_ctx) + else: + _create_local_cuda_repository(repository_ctx) + +cuda_configure = repository_rule( + environ = [ + _GCC_HOST_COMPILER_PATH, + _CLANG_CUDA_COMPILER_PATH, + "TF_NEED_CUDA", + "TF_CUDA_CLANG", + _TF_DOWNLOAD_CLANG, + _CUDA_TOOLKIT_PATH, + _CUDNN_INSTALL_PATH, + _TF_CUDA_VERSION, + _TF_CUDNN_VERSION, + _TF_CUDA_COMPUTE_CAPABILITIES, + "NVVMIR_LIBRARY_DIR", + _PYTHON_BIN_PATH, + ], + implementation = _cuda_autoconf_impl, +) + +"""Detects and configures the local CUDA toolchain. +Add the following to your WORKSPACE FILE: +```python +cuda_configure(name = "local_config_cuda") +``` +Args: + name: A unique name for this workspace rule. +""" diff --git a/build_deps/toolchains/gpu/find_cuda_config.py b/build_deps/toolchains/gpu/find_cuda_config.py new file mode 100644 index 0000000..b24430b --- /dev/null +++ b/build_deps/toolchains/gpu/find_cuda_config.py @@ -0,0 +1,682 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Prints CUDA library and header directories and versions found on the system. 
+
+The script searches for CUDA library and header files on the system, inspects
+them to determine their version and prints the configuration to stdout.
+The paths to inspect and the required versions are specified through environment
+variables. If no valid configuration is found, the script prints to stderr and
+returns an error code.
+
+The list of libraries to find is specified as arguments. Supported libraries are
+CUDA (includes cuBLAS), cuDNN, NCCL, and TensorRT.
+
+The script takes a list of base directories specified by the TF_CUDA_PATHS
+environment variable as a comma-separated glob list. The script looks for headers
+and library files in a hard-coded set of subdirectories from these base paths.
+If TF_CUDA_PATHS is not specified, an OS-specific default is used:
+
+ Linux: /usr/local/cuda, /usr, and paths from 'ldconfig -p'.
+ Windows: CUDA_PATH environment variable, or
+ C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\*
+
+For backwards compatibility, some libraries also use alternative base
+directories from other environment variables if they are specified. List of
+library-specific environment variables:
+
+ Library Version env variable Additional base directories
+ ----------------------------------------------------------------
+ CUDA TF_CUDA_VERSION CUDA_TOOLKIT_PATH
+ cuBLAS TF_CUBLAS_VERSION CUDA_TOOLKIT_PATH
+ cuDNN TF_CUDNN_VERSION CUDNN_INSTALL_PATH
+ NCCL TF_NCCL_VERSION NCCL_INSTALL_PATH, NCCL_HDR_PATH
+ TensorRT TF_TENSORRT_VERSION TENSORRT_INSTALL_PATH
+
+Version environment variables can be of the form 'x' or 'x.y' to request a
+specific version, empty or unspecified to accept any version.
+
+The output of a found library is of the form:
+tf_<library>_version: x.y.z
+tf_<library>_header_dir: ...
+tf_<library>_library_dir: ...
+""" + +import io +import os +import glob +import platform +import re +import subprocess +import sys + +# pylint: disable=g-import-not-at-top +try: + from shutil import which +except ImportError: + from distutils.spawn import find_executable as which +# pylint: enable=g-import-not-at-top + + +class ConfigError(Exception): + pass + + +def _is_linux(): + return platform.system() == "Linux" + + +def _is_windows(): + return platform.system() == "Windows" + + +def _is_macos(): + return platform.system() == "Darwin" + + +def _matches_version(actual_version, required_version): + """Checks whether some version meets the requirements. + + All elements of the required_version need to be present in the + actual_version. + + required_version actual_version result + ----------------------------------------- + 1 1.1 True + 1.2 1 False + 1.2 1.3 False + 1 True + + Args: + required_version: The version specified by the user. + actual_version: The version detected from the CUDA installation. + Returns: Whether the actual version matches the required one. + """ + if actual_version is None: + return False + + # Strip spaces from the versions. 
+ actual_version = actual_version.strip() + required_version = required_version.strip() + return actual_version.startswith(required_version) + + +def _at_least_version(actual_version, required_version): + actual = [int(v) for v in actual_version.split(".")] + required = [int(v) for v in required_version.split(".")] + return actual >= required + + +def _get_header_version(path, name): + """Returns preprocessor defines in C header file.""" + for line in io.open(path, "r", encoding="utf-8").readlines(): + match = re.match(r"#define %s +(\d+)" % name, line) + if match: + return match.group(1) + return "" + + +def _cartesian_product(first, second): + """Returns all path combinations of first and second.""" + return [os.path.join(f, s) for f in first for s in second] + + +def _get_ld_config_paths(): + """Returns all directories from 'ldconfig -p'.""" + if not _is_linux(): + return [] + ldconfig_path = which("ldconfig") or "/sbin/ldconfig" + output = subprocess.check_output([ldconfig_path, "-p"]) + pattern = re.compile(".* => (.*)") + result = set() + for line in output.splitlines(): + try: + match = pattern.match(line.decode("ascii")) + except UnicodeDecodeError: + match = False + if match: + result.add(os.path.dirname(match.group(1))) + return sorted(list(result)) + + +def _get_default_cuda_paths(cuda_version): + if not cuda_version: + cuda_version = "*" + elif "." 
not in cuda_version: + cuda_version = cuda_version + ".*" + + if _is_windows(): + return [ + os.environ.get( + "CUDA_PATH", + "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v%s\\" + % cuda_version, + ) + ] + return [ + "/usr/local/cuda-%s" % cuda_version, + "/usr/local/cuda", + "/usr", + "/usr/local/cudnn", + ] + _get_ld_config_paths() + + +def _header_paths(): + """Returns hard-coded set of relative paths to look for header files.""" + return [ + "", + "include", + "include/cuda", + "include/*-linux-gnu", + "extras/CUPTI/include", + "include/cuda/CUPTI", + "local/cuda/extras/CUPTI/include", + ] + + +def _library_paths(): + """Returns hard-coded set of relative paths to look for library files.""" + return [ + "", + "lib64", + "lib", + "lib/*-linux-gnu", + "lib/x64", + "extras/CUPTI/*", + "local/cuda/lib64", + "local/cuda/extras/CUPTI/lib64", + ] + + +def _not_found_error(base_paths, relative_paths, filepattern): + base_paths = "".join(["\n '%s'" % path for path in sorted(base_paths)]) + relative_paths = "".join(["\n '%s'" % path for path in relative_paths]) + return ConfigError( + "Could not find any %s in any subdirectory:%s\nof:%s\n" + % (filepattern, relative_paths, base_paths) + ) + + +def _find_file(base_paths, relative_paths, filepattern): + for path in _cartesian_product(base_paths, relative_paths): + for file in glob.glob(os.path.join(path, filepattern)): + return file + raise _not_found_error(base_paths, relative_paths, filepattern) + + +def _find_library(base_paths, library_name, required_version): + """Returns first valid path to the requested library.""" + if _is_windows(): + filepattern = library_name + ".lib" + elif _is_macos(): + filepattern = "%s*.dylib" % ( + ".".join(["lib" + library_name] + required_version.split(".")[:1]) + ) + else: + filepattern = ( + ".".join(["lib" + library_name, "so"] + required_version.split(".")[:1]) + + "*" + ) + return _find_file(base_paths, _library_paths(), filepattern) + + +def _find_versioned_file( + 
base_paths, relative_paths, filepatterns, required_version, get_version +): + """Returns first valid path to a file that matches the requested version.""" + if type(filepatterns) not in [list, tuple]: + filepatterns = [filepatterns] + for path in _cartesian_product(base_paths, relative_paths): + for filepattern in filepatterns: + for file in glob.glob(os.path.join(path, filepattern)): + actual_version = get_version(file) + if _matches_version(actual_version, required_version): + return file, actual_version + raise _not_found_error( + base_paths, + relative_paths, + ", ".join(filepatterns) + " matching version '%s'" % required_version, + ) + + +def _find_header(base_paths, header_name, required_version, get_version): + """Returns first valid path to a header that matches the requested version.""" + return _find_versioned_file( + base_paths, _header_paths(), header_name, required_version, get_version + ) + + +def _find_cuda_config(base_paths, required_version): + def get_header_version(path): + version = int(_get_header_version(path, "CUDA_VERSION")) + if not version: + return None + return "%d.%d" % (version // 1000, version % 1000 // 10) + + cuda_header_path, header_version = _find_header( + base_paths, "cuda.h", required_version, get_header_version + ) + cuda_version = header_version # x.y, see above. 
+ + cuda_library_path = _find_library(base_paths, "cudart", cuda_version) + + def get_nvcc_version(path): + pattern = r"Cuda compilation tools, release \d+\.\d+, V(\d+\.\d+\.\d+)" + for line in subprocess.check_output([path, "--version"]).splitlines(): + match = re.match(pattern, line.decode("ascii")) + if match: + return match.group(1) + return None + + nvcc_name = "nvcc.exe" if _is_windows() else "nvcc" + nvcc_path, nvcc_version = _find_versioned_file( + base_paths, + [ + "", + "bin", + "local/cuda/bin", + ], + nvcc_name, + cuda_version, + get_nvcc_version, + ) + + nvvm_path = _find_file( + base_paths, + [ + "nvvm/libdevice", + "share/cuda", + "lib/nvidia-cuda-toolkit/libdevice", + "local/cuda/nvvm/libdevice", + ], + "libdevice*.10.bc", + ) + + cupti_header_path = _find_file(base_paths, _header_paths(), "cupti.h") + cupti_library_path = _find_library(base_paths, "cupti", required_version) + + cuda_binary_dir = os.path.dirname(nvcc_path) + nvvm_library_dir = os.path.dirname(nvvm_path) + + # XLA requires the toolkit path to find ptxas and libdevice. + # TODO(csigg): pass in both directories instead. 
+ cuda_toolkit_paths = ( + os.path.normpath(os.path.join(cuda_binary_dir, "..")), + os.path.normpath(os.path.join(nvvm_library_dir, "../..")), + ) + if cuda_toolkit_paths[0] != cuda_toolkit_paths[1]: + raise ConfigError( + "Inconsistent CUDA toolkit path: %s vs %s" % cuda_toolkit_paths + ) + + return { + "cuda_version": cuda_version, + "cuda_include_dir": os.path.dirname(cuda_header_path), + "cuda_library_dir": os.path.dirname(cuda_library_path), + "cuda_binary_dir": cuda_binary_dir, + "nvvm_library_dir": nvvm_library_dir, + "cupti_include_dir": os.path.dirname(cupti_header_path), + "cupti_library_dir": os.path.dirname(cupti_library_path), + "cuda_toolkit_path": cuda_toolkit_paths[0], + } + + +def _find_cublas_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "10.1"): + + def get_header_version(path): + version = ( + _get_header_version(path, name) + for name in ("CUBLAS_VER_MAJOR", "CUBLAS_VER_MINOR", "CUBLAS_VER_PATCH") + ) + return ".".join(version) + + header_path, header_version = _find_header( + base_paths, "cublas_api.h", required_version, get_header_version + ) + # cuBLAS uses the major version only. + cublas_version = header_version.split(".")[0] + + else: + # There is no version info available before CUDA 10.1, just find the file. + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "cublas_api.h") + # cuBLAS version is the same as CUDA version (x.y). 
+ cublas_version = required_version + + library_path = _find_library(base_paths, "cublas", cublas_version) + + return { + "cublas_version": header_version, + "cublas_include_dir": os.path.dirname(header_path), + "cublas_library_dir": os.path.dirname(library_path), + } + + +def _find_cusolver_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = ( + _get_header_version(path, name) + for name in ( + "CUSOLVER_VER_MAJOR", + "CUSOLVER_VER_MINOR", + "CUSOLVER_VER_PATCH", + ) + ) + return ".".join(version) + + header_path, header_version = _find_header( + base_paths, "cusolver_common.h", required_version, get_header_version + ) + cusolver_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "cusolver_common.h") + cusolver_version = required_version + + library_path = _find_library(base_paths, "cusolver", cusolver_version) + + return { + "cusolver_version": header_version, + "cusolver_include_dir": os.path.dirname(header_path), + "cusolver_library_dir": os.path.dirname(library_path), + } + + +def _find_curand_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = ( + _get_header_version(path, name) + for name in ("CURAND_VER_MAJOR", "CURAND_VER_MINOR", "CURAND_VER_PATCH") + ) + return ".".join(version) + + header_path, header_version = _find_header( + base_paths, "curand.h", required_version, get_header_version + ) + curand_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "curand.h") + curand_version = required_version + + library_path = _find_library(base_paths, "curand", curand_version) + + return { + "curand_version": header_version, + "curand_include_dir": os.path.dirname(header_path), + "curand_library_dir": 
os.path.dirname(library_path), + } + + +def _find_cufft_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = ( + _get_header_version(path, name) + for name in ("CUFFT_VER_MAJOR", "CUFFT_VER_MINOR", "CUFFT_VER_PATCH") + ) + return ".".join(version) + + header_path, header_version = _find_header( + base_paths, "cufft.h", required_version, get_header_version + ) + cufft_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "cufft.h") + cufft_version = required_version + + library_path = _find_library(base_paths, "cufft", cufft_version) + + return { + "cufft_version": header_version, + "cufft_include_dir": os.path.dirname(header_path), + "cufft_library_dir": os.path.dirname(library_path), + } + + +def _find_cudnn_config(base_paths, required_version): + def get_header_version(path): + version = [ + _get_header_version(path, name) + for name in ("CUDNN_MAJOR", "CUDNN_MINOR", "CUDNN_PATCHLEVEL") + ] + return ".".join(version) if version[0] else None + + header_path, header_version = _find_header( + base_paths, ("cudnn.h", "cudnn_version.h"), required_version, get_header_version + ) + cudnn_version = header_version.split(".")[0] + + library_path = _find_library(base_paths, "cudnn", cudnn_version) + + return { + "cudnn_version": cudnn_version, + "cudnn_include_dir": os.path.dirname(header_path), + "cudnn_library_dir": os.path.dirname(library_path), + } + + +def _find_cusparse_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = ( + _get_header_version(path, name) + for name in ( + "CUSPARSE_VER_MAJOR", + "CUSPARSE_VER_MINOR", + "CUSPARSE_VER_PATCH", + ) + ) + return ".".join(version) + + header_path, header_version = _find_header( + base_paths, "cusparse.h", required_version, get_header_version + ) + 
cusparse_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "cusparse.h") + cusparse_version = required_version + + library_path = _find_library(base_paths, "cusparse", cusparse_version) + + return { + "cusparse_version": header_version, + "cusparse_include_dir": os.path.dirname(header_path), + "cusparse_library_dir": os.path.dirname(library_path), + } + + +def _find_nccl_config(base_paths, required_version): + def get_header_version(path): + version = ( + _get_header_version(path, name) + for name in ("NCCL_MAJOR", "NCCL_MINOR", "NCCL_PATCH") + ) + return ".".join(version) + + header_path, header_version = _find_header( + base_paths, "nccl.h", required_version, get_header_version + ) + nccl_version = header_version.split(".")[0] + + library_path = _find_library(base_paths, "nccl", nccl_version) + + return { + "nccl_version": nccl_version, + "nccl_include_dir": os.path.dirname(header_path), + "nccl_library_dir": os.path.dirname(library_path), + } + + +def _find_tensorrt_config(base_paths, required_version): + def get_header_version(path): + version = ( + _get_header_version(path, name) + for name in ("NV_TENSORRT_MAJOR", "NV_TENSORRT_MINOR", "NV_TENSORRT_PATCH") + ) + # `version` is a generator object, so we convert it to a list before using + # it (multiple times below). + version = list(version) + if not all(version): + return None # Versions not found, make _matches_version return False. + return ".".join(version) + + try: + header_path, header_version = _find_header( + base_paths, "NvInfer.h", required_version, get_header_version + ) + except ConfigError: + # TensorRT 6 moved the version information to NvInferVersion.h.
+ header_path, header_version = _find_header( + base_paths, "NvInferVersion.h", required_version, get_header_version + ) + + tensorrt_version = header_version.split(".")[0] + library_path = _find_library(base_paths, "nvinfer", tensorrt_version) + + return { + "tensorrt_version": tensorrt_version, + "tensorrt_include_dir": os.path.dirname(header_path), + "tensorrt_library_dir": os.path.dirname(library_path), + } + + +def _list_from_env(env_name, default=[]): + """Returns comma-separated list from environment variable.""" + if env_name in os.environ: + return os.environ[env_name].split(",") + return default + + +def _get_legacy_path(env_name, default=[]): + """Returns a path specified by a legacy environment variable. + + CUDNN_INSTALL_PATH, NCCL_INSTALL_PATH, TENSORRT_INSTALL_PATH set to + '/usr/lib/x86_64-linux-gnu' would previously find both library and header + paths. Detect those and return '/usr', otherwise forward to _list_from_env(). + """ + if env_name in os.environ: + match = re.match(r"^(/[^/ ]*)+/lib/\w+-linux-gnu/?$", os.environ[env_name]) + if match: + return [match.group(1)] + return _list_from_env(env_name, default) + + +def _normalize_path(path): + """Returns normalized path, with forward slashes on Windows.""" + path = os.path.realpath(path) + if _is_windows(): + path = path.replace("\\", "/") + return path + + +def find_cuda_config(): + """Returns a dictionary of CUDA library and header file paths.""" + libraries = [argv.lower() for argv in sys.argv[1:]] + cuda_version = os.environ.get("TF_CUDA_VERSION", "") + base_paths = _list_from_env("TF_CUDA_PATHS", _get_default_cuda_paths(cuda_version)) + base_paths = [path for path in base_paths if os.path.exists(path)] + + result = {} + if "cuda" in libraries: + cuda_paths = _list_from_env("CUDA_TOOLKIT_PATH", base_paths) + result.update(_find_cuda_config(cuda_paths, cuda_version)) + + cuda_version = result["cuda_version"] + cublas_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < 
(10, 1): + # Before CUDA 10.1, cuBLAS was in the same directory as the toolkit. + cublas_paths = cuda_paths + cublas_version = os.environ.get("TF_CUBLAS_VERSION", "") + result.update(_find_cublas_config(cublas_paths, cublas_version, cuda_version)) + + cusolver_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + cusolver_paths = cuda_paths + cusolver_version = os.environ.get("TF_CUSOLVER_VERSION", "") + result.update( + _find_cusolver_config(cusolver_paths, cusolver_version, cuda_version) + ) + + curand_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + curand_paths = cuda_paths + curand_version = os.environ.get("TF_CURAND_VERSION", "") + result.update(_find_curand_config(curand_paths, curand_version, cuda_version)) + + cufft_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + cufft_paths = cuda_paths + cufft_version = os.environ.get("TF_CUFFT_VERSION", "") + result.update(_find_cufft_config(cufft_paths, cufft_version, cuda_version)) + + cusparse_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + cusparse_paths = cuda_paths + cusparse_version = os.environ.get("TF_CUSPARSE_VERSION", "") + result.update( + _find_cusparse_config(cusparse_paths, cusparse_version, cuda_version) + ) + + if "cudnn" in libraries: + cudnn_paths = _get_legacy_path("CUDNN_INSTALL_PATH", base_paths) + cudnn_version = os.environ.get("TF_CUDNN_VERSION", "") + result.update(_find_cudnn_config(cudnn_paths, cudnn_version)) + + if "nccl" in libraries: + nccl_paths = _get_legacy_path("NCCL_INSTALL_PATH", base_paths) + nccl_version = os.environ.get("TF_NCCL_VERSION", "") + result.update(_find_nccl_config(nccl_paths, nccl_version)) + + if "tensorrt" in libraries: + tensorrt_paths = _get_legacy_path("TENSORRT_INSTALL_PATH", base_paths) + tensorrt_version = os.environ.get("TF_TENSORRT_VERSION", "") + result.update(_find_tensorrt_config(tensorrt_paths, tensorrt_version)) + + for 
k, v in result.items(): + if k.endswith("_dir") or k.endswith("_path"): + result[k] = _normalize_path(v) + + return result + + +def main(): + try: + for key, value in sorted(find_cuda_config().items()): + print("%s: %s" % (key, value)) + except ConfigError as e: + sys.stderr.write(str(e)) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/configure.py b/configure.py new file mode 100644 index 0000000..3cacf01 --- /dev/null +++ b/configure.py @@ -0,0 +1,199 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# Usage: python configure.py +# + + +import os +import pathlib +import platform +import logging + +import tensorflow as tf + +from packaging.version import Version + +_TFA_BAZELRC = ".bazelrc" + + +# Writes variables to bazelrc file +def write(line): + with open(_TFA_BAZELRC, "a") as f: + f.write(line + "\n") + + +def write_action_env(var_name, var): + write('build --action_env {}="{}"'.format(var_name, var)) + + +def is_macos(): + return platform.system() == "Darwin" + + +def is_windows(): + return platform.system() == "Windows" + + +def is_linux(): + return platform.system() == "Linux" + + +def is_raspi_arm(): + return os.uname()[4] == "armv7l" or os.uname()[4] == "aarch64" + + +def is_linux_ppc64le(): + return is_linux() and platform.machine() == "ppc64le" + + +def is_linux_x86_64(): + return is_linux() and platform.machine() == "x86_64" + + +def is_linux_arm(): + return is_linux() and platform.machine() == "arm" + + +def is_linux_aarch64(): + return is_linux() and platform.machine() == "aarch64" + + +def is_linux_s390x(): + return is_linux() and platform.machine() == "s390x" + + +def get_tf_header_dir(): + import tensorflow as tf + + tf_header_dir = tf.sysconfig.get_compile_flags()[0][2:] + if is_windows(): + tf_header_dir = tf_header_dir.replace("\\", "/") + return tf_header_dir + + +def get_cpp_version(): + cpp_version = "c++14" + if Version(tf.__version__) >= Version("2.10"): + cpp_version = "c++17" + return cpp_version + + +def get_tf_shared_lib_dir(): + import tensorflow as tf + + # OS Specific parsing + if is_windows(): + tf_shared_lib_dir = tf.sysconfig.get_compile_flags()[0][2:-7] + "python" + return tf_shared_lib_dir.replace("\\", "/") + elif is_raspi_arm(): + return tf.sysconfig.get_compile_flags()[0][2:-7] + "python" + else: + return tf.sysconfig.get_link_flags()[0][2:] + + +# Converts the linkflag namespec to the full shared library name +def get_shared_lib_name(): + 
import tensorflow as tf + + namespec = tf.sysconfig.get_link_flags() + if is_macos(): + # MacOS + return "lib" + namespec[1][2:] + ".dylib" + elif is_windows(): + # Windows + return "_pywrap_tensorflow_internal.lib" + elif is_raspi_arm(): + # The below command for linux would return an empty list + return "_pywrap_tensorflow_internal.so" + else: + # Linux + return namespec[1][3:] + + +def create_build_configuration(): + print() + print("Configuring TensorFlow NUFFT to be built from source...") + + if os.path.isfile(_TFA_BAZELRC): + os.remove(_TFA_BAZELRC) + + logging.disable(logging.WARNING) + + write_action_env("TF_HEADER_DIR", get_tf_header_dir()) + write_action_env("TF_SHARED_LIBRARY_DIR", get_tf_shared_lib_dir()) + write_action_env("TF_SHARED_LIBRARY_NAME", get_shared_lib_name()) + write_action_env("TF_CXX11_ABI_FLAG", tf.sysconfig.CXX11_ABI_FLAG) + + # This should be replaced with a call to tf.sysconfig if it's added + write_action_env("TF_CPLUSPLUS_VER", get_cpp_version()) + + write("build --spawn_strategy=standalone") + write("build --strategy=Genrule=standalone") + write("build --experimental_repo_remote_exec") + write("build -c opt") + write( + "build --cxxopt=" + + '"-D_GLIBCXX_USE_CXX11_ABI="' + + str(tf.sysconfig.CXX11_ABI_FLAG) + ) + + if is_windows(): + write("build --config=windows") + write("build:windows --enable_runfiles") + write("build:windows --copt=/experimental:preprocessor") + write("build:windows --host_copt=/experimental:preprocessor") + write("build:windows --copt=/arch=AVX") + write("build:windows --cxxopt=/std:" + get_cpp_version()) + write("build:windows --host_cxxopt=/std:" + get_cpp_version()) + + if is_macos() or is_linux(): + if not is_linux_ppc64le() and not is_linux_arm() and not is_linux_aarch64(): + write("build --copt=-mavx") + write("build --cxxopt=-std=" + get_cpp_version()) + write("build --host_cxxopt=-std=" + get_cpp_version()) + + if os.getenv("TF_NEED_CUDA", "0") == "1": + print("> Building GPU & CPU ops") + 
configure_cuda() + else: + print("> Building only CPU ops") + + print() + print("Build configurations successfully written to", _TFA_BAZELRC, ":\n") + print(pathlib.Path(_TFA_BAZELRC).read_text()) + + +def configure_cuda(): + write_action_env("TF_NEED_CUDA", "1") + write_action_env( + "CUDA_TOOLKIT_PATH", os.getenv("CUDA_TOOLKIT_PATH", "/usr/local/cuda") + ) + write_action_env( + "CUDNN_INSTALL_PATH", + os.getenv("CUDNN_INSTALL_PATH", "/usr/lib/x86_64-linux-gnu"), + ) + write_action_env("TF_CUDA_VERSION", os.getenv("TF_CUDA_VERSION", "11.2")) + write_action_env("TF_CUDNN_VERSION", os.getenv("TF_CUDNN_VERSION", "8")) + + write("test --config=cuda") + write("build --config=cuda") + write("build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true") + write( + "build:cuda --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain" + ) + + +if __name__ == "__main__": + create_build_configuration() diff --git a/tensorflow_nufft/BUILD b/tensorflow_nufft/BUILD new file mode 100644 index 0000000..70435e8 --- /dev/null +++ b/tensorflow_nufft/BUILD @@ -0,0 +1,37 @@ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +config_setting( + name = "windows", + constraint_values = ["@bazel_tools//platforms:windows"], +) + +py_library( + name = "tensorflow_nufft", + srcs = glob(["*.py"]), + -- deps = [ + -- "//tensorflow_nufft/activations", + -- "//tensorflow_nufft/callbacks", + -- "//tensorflow_nufft/image", + -- "//tensorflow_addons/layers", + -- "//tensorflow_addons/losses", + -- "//tensorflow_addons/metrics", + -- "//tensorflow_addons/optimizers", + -- "//tensorflow_addons/rnn", + -- "//tensorflow_addons/seq2seq", + -- "//tensorflow_addons/testing", + -- "//tensorflow_addons/text", + -- "//tensorflow_addons/utils", + -- ], +) + +-- py_test( +-- name = "tensorflow_nufft_test", +-- size = "small", +-- srcs = glob(["tests/*"]), +-- main = "tests/run_all_test.py", +-- deps = [ +-- 
":tensorflow_nufft", +-- ], +-- ) diff --git a/tensorflow_nufft/tensorflow_nufft.bzl b/tensorflow_nufft/tensorflow_nufft.bzl new file mode 100644 index 0000000..d019c20 --- /dev/null +++ b/tensorflow_nufft/tensorflow_nufft.bzl @@ -0,0 +1,68 @@ +load("@local_config_tf//:build_defs.bzl", "CPLUSPLUS_VERSION", "D_GLIBCXX_USE_CXX11_ABI") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configured") + +def custom_op_library( + name, + srcs = [], + cuda_srcs = [], + deps = [], + cuda_deps = [], + copts = [], + **kwargs): + deps = deps + [ + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ] + + if cuda_srcs: + copts = copts + if_cuda(["-DGOOGLE_CUDA=1"]) + cuda_copts = copts + if_cuda_is_configured([ + "-x cuda", + "-nvcc_options=relaxed-constexpr", + "-nvcc_options=ftz=true", + ]) + cuda_deps = deps + if_cuda_is_configured(cuda_deps) + if_cuda_is_configured([ + "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cudart_static", + ]) + basename = name.split(".")[0] + native.cc_library( + name = basename + "_gpu", + srcs = cuda_srcs, + deps = cuda_deps, + copts = cuda_copts, + alwayslink = 1, + **kwargs + ) + deps = deps + if_cuda_is_configured([":" + basename + "_gpu"]) + + copts = copts + select({ + "//tensorflow_nufft:windows": [ + "/DEIGEN_STRONG_INLINE=inline", + "-DTENSORFLOW_MONOLITHIC_BUILD", + "/D_USE_MATH_DEFINES", + "/DPLATFORM_WINDOWS", + "/DEIGEN_HAS_C99_MATH", + "/DTENSORFLOW_USE_EIGEN_THREADPOOL", + "/DEIGEN_AVOID_STL_ARRAY", + "/Iexternal/gemmlowp", + "/wd4018", + "/wd4577", + "/DNOGDI", + "/UTF_COMPILE_LIBRARY", + ], + "//conditions:default": ["-pthread", CPLUSPLUS_VERSION, D_GLIBCXX_USE_CXX11_ABI], + }) + + native.cc_binary( + name = name, + srcs = srcs, + copts = copts, + linkshared = 1, + features = select({ + "//tensorflow_nufft:windows": ["windows_export_all_symbols"], + "//conditions:default": [], + }), + deps = deps, + **kwargs + ) diff --git 
a/tools/build/make_wheel_Linux_x86.sh b/tools/build/make_wheel_Linux_x86.sh new file mode 100644 index 0000000..c76289c --- /dev/null +++ b/tools/build/make_wheel_Linux_x86.sh @@ -0,0 +1,17 @@ +set -e -x + +df -h +docker info +# to get more disk space +rm -rf /usr/share/dotnet & + +# Tests are ran as part of make_wheel target +DOCKER_BUILDKIT=1 docker build \ + -f tools/docker/build_wheel.Dockerfile \ + --output type=local,dest=wheelhouse \ + --build-arg PY_VERSION \ + --build-arg TF_VERSION \ + --build-arg NIGHTLY_FLAG \ + --build-arg NIGHTLY_TIME \ + --build-arg SKIP_CUSTOM_OP_TESTS \ + ./ diff --git a/tools/build/make_wheel_Windows_x86.sh b/tools/build/make_wheel_Windows_x86.sh new file mode 100644 index 0000000..18204b4 --- /dev/null +++ b/tools/build/make_wheel_Windows_x86.sh @@ -0,0 +1,26 @@ +set -e -x + +export TF_NEED_CUDA=0 +export PYTHON_BIN_PATH=$(which python) +export BAZEL_VC="C:/Program Files (x86)/Microsoft Visual Studio/2019/Enterprise/VC/" + +# Install Deps +python --version +python -m pip install --default-timeout=1000 wheel setuptools tensorflow==$TF_VERSION + +# Test +bash ./tools/testing/build_and_run_tests.sh $SKIP_CUSTOM_OP_TESTS + +# Clean +bazel clean + +# Build +python configure.py + +bazel.exe build \ + --noshow_progress \ + --noshow_loading_progress \ + --verbose_failures \ + --test_output=errors \ + build_pip_pkg +bazel-bin/build_pip_pkg wheelhouse $NIGHTLY_FLAG diff --git a/tools/build/make_wheel_macOS_arm64.sh b/tools/build/make_wheel_macOS_arm64.sh new file mode 100644 index 0000000..4724afc --- /dev/null +++ b/tools/build/make_wheel_macOS_arm64.sh @@ -0,0 +1,33 @@ +set -e -x + +export TF_NEED_CUDA=0 + +python --version +python -m pip install --default-timeout=1000 delocate==0.10.3 wheel setuptools tensorflow==$TF_VERSION + +python configure.py +# Setting DYLD_LIBRARY_PATH to help delocate finding tensorflow after the rpath invalidation +export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$(python -c 'import configure; 
print(configure.get_tf_shared_lib_dir())') + +# For dynamic linking, we want the ARM version of TensorFlow. +# Since we cannot run it on x86 so we need to force pip to install it regardless +python -m pip install \ + --platform=macosx_12_0_arm64 \ + --no-deps \ + --target=$(python -c 'import site; print(site.getsitepackages()[0])') \ + --upgrade \ + tensorflow-macos==$TF_VERSION + +bazel build \ + --cpu=darwin_arm64 \ + --copt -mmacosx-version-min=12.0 \ + --linkopt -mmacosx-version-min=12.0 \ + --noshow_progress \ + --noshow_loading_progress \ + --verbose_failures \ + --test_output=errors \ + build_pip_pkg + +bazel-bin/build_pip_pkg artifacts "--plat-name macosx_11_0_arm64 $NIGHTLY_FLAG" +delocate-wheel -w wheelhouse -v artifacts/*.whl + diff --git a/tools/build/make_wheel_macOS_x86.sh b/tools/build/make_wheel_macOS_x86.sh new file mode 100644 index 0000000..908a671 --- /dev/null +++ b/tools/build/make_wheel_macOS_x86.sh @@ -0,0 +1,32 @@ +set -e -x + +export TF_NEED_CUDA=0 + +# Install Deps +python --version +python -m pip install --default-timeout=1000 delocate==0.10.3 wheel setuptools tensorflow==$TF_VERSION + +# Test +bash ./tools/testing/build_and_run_tests.sh $SKIP_CUSTOM_OP_TESTS + +# Clean +bazel clean + +# Build +python configure.py + +bazel build \ + --copt=-mmacosx-version-min=10.14 \ + --linkopt=-mmacosx-version-min=10.14 \ + --noshow_progress \ + --noshow_loading_progress \ + --verbose_failures \ + --test_output=errors \ + build_pip_pkg + +bazel-bin/build_pip_pkg artifacts $NIGHTLY_FLAG + +# Setting DYLD_LIBRARY_PATH to help delocate finding tensorflow after the rpath invalidation +export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$(python -c 'import configure; print(configure.get_tf_shared_lib_dir())') +delocate-wheel -w wheelhouse -v artifacts/*.whl + diff --git a/tools/docker/build_wheel.Dockerfile b/tools/docker/build_wheel.Dockerfile new file mode 100644 index 0000000..34eb3a1 --- /dev/null +++ b/tools/docker/build_wheel.Dockerfile @@ -0,0 +1,74 @@ 
+#syntax=docker/dockerfile:1.1.5-experimental +ARG PY_VERSION +FROM tensorflow/build:2.11-python$PY_VERSION as base_install + +ENV TF_NEED_CUDA="1" +ARG PY_VERSION +ARG TF_VERSION + +# TODO: Remove this if tensorflow/build container removes their keras-nightly install +# https://github.com/tensorflow/build/issues/78 +RUN python -m pip uninstall -y keras-nightly + +RUN python -m pip install --default-timeout=1000 tensorflow==$TF_VERSION + +COPY tools/install_deps/ /install_deps +RUN python -m pip install -r /install_deps/pytest.txt + +COPY requirements.txt . +RUN python -m pip install -r requirements.txt + +COPY ./ /tensorflow-nufft +WORKDIR /tensorflow-nufft + +# ------------------------------------------------------------------- +FROM base_install as tfa_gpu_tests +CMD ["bash", "tools/testing/build_and_run_tests.sh"] + +# ------------------------------------------------------------------- +FROM base_install as make_wheel +ARG NIGHTLY_FLAG= +ARG NIGHTLY_TIME= +ARG SKIP_CUSTOM_OP_TESTS= + +# SKIP_CUSTOM_OP_TESTS is not supported. 
+RUN if [[ -n "$SKIP_CUSTOM_OP_TESTS" ]] ; then exit 1 ; fi + +RUN python configure.py + +# Test Before Building +RUN bash tools/testing/build_and_run_tests.sh $SKIP_CUSTOM_OP_TESTS + +# Build +RUN bazel build \ + --noshow_progress \ + --noshow_loading_progress \ + --verbose_failures \ + --test_output=errors \ + --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain \ + build_pip_pkg && \ + # Package Whl + bazel-bin/build_pip_pkg artifacts $NIGHTLY_FLAG + +RUN bash tools/releases/tf_auditwheel_patch.sh +RUN python -m auditwheel repair --plat manylinux2014_x86_64 artifacts/*.whl +RUN ls -al wheelhouse/ + +# ------------------------------------------------------------------- + +FROM python:$PY_VERSION as test_wheel_in_fresh_environment + +ARG TF_VERSION +ARG SKIP_CUSTOM_OP_TESTS + +RUN python -m pip install --default-timeout=1000 tensorflow==$TF_VERSION + +COPY --from=make_wheel /tensorflow-nufft/wheelhouse/ /tensorflow-nufft/wheelhouse/ +RUN pip install /tensorflow-nufft/wheelhouse/*.whl + +RUN if [[ -z "$SKIP_CUSTOM_OP_TESTS" ]] ; then python -c "import tensorflow_nufft as tfft" ; else python -c "import tensorflow_nufft as tfft" ; fi + +# ------------------------------------------------------------------- +FROM scratch as output + +COPY --from=test_wheel_in_fresh_environment /tensorflow-nufft/wheelhouse/ . diff --git a/tools/install_deps/black.txt b/tools/install_deps/black.txt new file mode 100644 index 0000000..a84bf64 --- /dev/null +++ b/tools/install_deps/black.txt @@ -0,0 +1 @@ +black==22.3.0 diff --git a/tools/install_deps/buildifier.sh b/tools/install_deps/buildifier.sh new file mode 100644 index 0000000..ddf937b --- /dev/null +++ b/tools/install_deps/buildifier.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +wget -O /usr/local/bin/buildifier https://github.com/bazelbuild/buildtools/releases/download/0.29.0/buildifier +chmod +x /usr/local/bin/buildifier diff --git a/tools/install_deps/clang-format.sh b/tools/install_deps/clang-format.sh new file mode 100644 index 0000000..16f93dc --- /dev/null +++ b/tools/install_deps/clang-format.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + + +wget -O /usr/local/bin/clang-format-9 https://github.com/DoozyX/clang-format-lint-action/raw/master/clang-format/clang-format9 +chmod +x /usr/local/bin/clang-format-9 +ln -s /usr/local/bin/clang-format-9 /usr/local/bin/clang-format diff --git a/tools/install_deps/doc_requirements.txt b/tools/install_deps/doc_requirements.txt new file mode 100644 index 0000000..7a7a9ed --- /dev/null +++ b/tools/install_deps/doc_requirements.txt @@ -0,0 +1,2 @@ +git+https://github.com/tensorflow/docs +pyyaml \ No newline at end of file diff --git a/tools/install_deps/flake8.txt b/tools/install_deps/flake8.txt new file mode 100644 index 0000000..a4acfac --- /dev/null +++ b/tools/install_deps/flake8.txt @@ -0,0 +1,2 @@ +flake8~=4.0 +pep8-naming~=0.12.1 diff --git a/tools/install_deps/install_bazelisk.sh b/tools/install_deps/install_bazelisk.sh new file mode 100644 index 0000000..6305798 --- /dev/null +++ b/tools/install_deps/install_bazelisk.sh @@ -0,0 +1,23 @@ +# Downloads bazelisk to ${output_dir} as `bazel`. +date + +output_dir=${1:-"/usr/local/bin"} + +case "$(uname -s)" in + Darwin) name=bazelisk-darwin-amd64 ;; + Linux) name=bazelisk-linux-amd64 ;; + *) name=bazelisk-windows-amd64 ;; +esac + +mkdir -p "${output_dir}" +curl -LO "https://github.com/bazelbuild/bazelisk/releases/download/v1.3.0/${name}" + +mv "${name}" "${output_dir}/bazel" +chmod u+x "${output_dir}/bazel" + +if [[ ! 
":$PATH:" =~ :${output_dir}/?: ]]; then + PATH="${output_dir}:$PATH" +fi + +which bazel +date diff --git a/tools/install_deps/pytest.txt b/tools/install_deps/pytest.txt new file mode 100644 index 0000000..d49227a --- /dev/null +++ b/tools/install_deps/pytest.txt @@ -0,0 +1,7 @@ +pytest~=6.2.5 +pytest-xdist~=1.31 +pytest-extra-durations~=0.1.3 +scikit-learn~=1.0.2 +scikit-image~=0.19.2 +Pillow~=9.0.1 +tqdm>=4.36.1 diff --git a/tools/install_deps/tensorflow-cpu.txt b/tools/install_deps/tensorflow-cpu.txt new file mode 100644 index 0000000..a56d24a --- /dev/null +++ b/tools/install_deps/tensorflow-cpu.txt @@ -0,0 +1 @@ +tensorflow-cpu~=2.11.0 diff --git a/tools/install_deps/tensorflow.txt b/tools/install_deps/tensorflow.txt new file mode 100644 index 0000000..1f4b31a --- /dev/null +++ b/tools/install_deps/tensorflow.txt @@ -0,0 +1 @@ +tensorflow~=2.11.0 \ No newline at end of file diff --git a/tools/install_deps/typedapi.txt b/tools/install_deps/typedapi.txt new file mode 100644 index 0000000..386d7f9 --- /dev/null +++ b/tools/install_deps/typedapi.txt @@ -0,0 +1 @@ +typedapi~=0.2.0 diff --git a/tools/install_so_files.sh b/tools/install_so_files.sh new file mode 100644 index 0000000..6b6168f --- /dev/null +++ b/tools/install_so_files.sh @@ -0,0 +1,8 @@ +set -e -x + +if [ "$TF_NEED_CUDA" == "1" ]; then + CUDA_FLAG="--crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain" +fi + +bazel build $CUDA_FLAG //tensorflow_nufft/... +cp ./bazel-bin/tensorflow_nufft/python/ops/_nufft_ops.so ./tensorflow_nufft/python/ops/ diff --git a/tools/testing/build_and_run_tests.sh b/tools/testing/build_and_run_tests.sh new file mode 100644 index 0000000..a17ee31 --- /dev/null +++ b/tools/testing/build_and_run_tests.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ============================================================================== +# usage: bash tools/testing/build_and_run_tests.sh + +set -x -e + +SKIP_CUSTOM_OP_TESTS_FLAG=${1} + +python -m pip install -r tools/install_deps/pytest.txt -e ./ +python ./configure.py +bash tools/install_so_files.sh +python -c "import tensorflow as tf; print(tf.config.list_physical_devices())" + +# use 10 workers if a gpu is available, otherwise, +# one worker per cpu core. Kokoro has 38 cores, that'd be too much +# for the gpu memory, until we change the device placement to +# use multiple gpus when they are available. +EXTRA_ARGS="-n 10" +if ! [ -x "$(command -v nvidia-smi)" ]; then + EXTRA_ARGS="-n auto" +fi + +bazel clean +python -m pytest -v --functions-durations=20 --modules-durations=5 $SKIP_CUSTOM_OP_TESTS_FLAG $EXTRA_ARGS ./tensorflow_nufft diff --git a/tools/testing/parallel_gpu_execute.sh b/tools/testing/parallel_gpu_execute.sh new file mode 100644 index 0000000..2efe8de --- /dev/null +++ b/tools/testing/parallel_gpu_execute.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# +# A script to run multiple GPU tests in parallel controlled with an environment +# variable. +# +# Required environment variables: +# TF_GPU_COUNT = Number of GPUs available. + +TF_GPU_COUNT=${TF_GPU_COUNT:-4} +TF_TESTS_PER_GPU=${TF_TESTS_PER_GPU:-8} +# We want to allow running one of the following configs: +# - 4 tests per GPU on k80 +# - 8 tests per GPU on p100 +# p100 has minimum 12G memory. Therefore, we should limit each test to 1.5G. +# To leave some room in case we want to run more tests in parallel in the +# future and to use a rounder number, we set it to 1G. +export TF_PER_DEVICE_MEMORY_LIMIT_MB=${TF_PER_DEVICE_MEMORY_LIMIT_MB:-1024} + +# ******************************************************************* +# This section of the script is needed to +# make things work on windows under msys. +# ******************************************************************* +RUNFILES_MANIFEST_FILE="${TEST_SRCDIR}/MANIFEST" +function rlocation() { + if is_absolute "$1" ; then + # If the file path is already fully specified, simply return it. + echo "$1" + elif [[ -e "$TEST_SRCDIR/$1" ]]; then + # If the file exists in the $TEST_SRCDIR then just use it. + echo "$TEST_SRCDIR/$1" + elif [[ -e "$RUNFILES_MANIFEST_FILE" ]]; then + # If a runfiles manifest file exists then use it. 
+ echo "$(grep "^$1 " "$RUNFILES_MANIFEST_FILE" | sed 's/[^ ]* //')" + fi +} + +TEST_BINARY="$(rlocation $TEST_WORKSPACE/${1#./})" +shift +# ******************************************************************* + +mkdir -p /var/lock +# Try to acquire any of the TF_GPU_COUNT * TF_TESTS_PER_GPU +# slots to run a test at. +# +# Prefer to allocate 1 test per GPU over 4 tests on 1 GPU. +# So, we iterate over TF_TESTS_PER_GPU first. +for j in `seq 0 $((TF_TESTS_PER_GPU-1))`; do + for i in `seq 0 $((TF_GPU_COUNT-1))`; do + exec {lock_fd}>/var/lock/gpulock${i}_${j} || exit 1 + if flock -n "$lock_fd"; + then + ( + # This export only works within the brackets, so it is isolated to one + # single command. + export CUDA_VISIBLE_DEVICES=$i + export HIP_VISIBLE_DEVICES=$i + echo "Running test $TEST_BINARY $* on GPU $CUDA_VISIBLE_DEVICES" + "$TEST_BINARY" $@ + ) + return_code=$? + flock -u "$lock_fd" + exit $return_code + fi + done +done + +echo "Cannot find a free GPU to run the test $* on, exiting with failure..." +exit 1 diff --git a/tools/testing/source_code_test.py b/tools/testing/source_code_test.py new file mode 100644 index 0000000..1c9f35b --- /dev/null +++ b/tools/testing/source_code_test.py @@ -0,0 +1,241 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# +import glob +import os + +from typedapi import ensure_api_is_typed + +import importlib +import tensorflow_nufft as tfa +import tensorflow as tf + +BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) + + +def test_api_typed(): + modules_list = [ + tfa, + tfa.activations, + tfa.callbacks, + tfa.image, + tfa.losses, + tfa.metrics, + tfa.optimizers, + tfa.rnn, + tfa.seq2seq, + tfa.text, + ] + # Files within this list will be exempt from verification. + exception_list = [ + tfa.rnn.PeepholeLSTMCell, + tf.keras.optimizers.Optimizer, + ] + if importlib.util.find_spec("tensorflow.keras.optimizers.legacy") is not None: + exception_list.append(tf.keras.optimizers.legacy.Optimizer) + + help_message = ( + "You can also take a look at the section about it in the CONTRIBUTING.md:\n" + "https://github.com/tensorflow/addons/blob/master/CONTRIBUTING.md#about-type-hints" + ) + ensure_api_is_typed( + modules_list, + exception_list, + init_only=True, + additional_message=help_message, + ) + + +def test_case_insensitive_filesystems(): + # Make sure BASE_DIR is project root. + # If it doesn't, we probably computed the wrong directory. 
+ if not os.path.isdir(os.path.join(BASE_DIR, "tensorflow_nufft")): + raise AssertionError("BASE_DIR = {} is not project root".format(BASE_DIR)) + + for dirpath, dirnames, filenames in os.walk(BASE_DIR, followlinks=True): + lowercase_directories = [x.lower() for x in dirnames] + lowercase_files = [x.lower() for x in filenames] + + lowercase_dir_contents = lowercase_directories + lowercase_files + if len(lowercase_dir_contents) != len(set(lowercase_dir_contents)): + raise AssertionError( + "Files with same name but different case detected " + "in directory: {}".format(dirpath) + ) + + +def get_lines_of_source_code(allowlist=None): + allowlist = allowlist or [] + source_dir = os.path.join(BASE_DIR, "tensorflow_nufft") + for path in glob.glob(source_dir + "/**/*.py", recursive=True): + if in_allowlist(path, allowlist): + continue + with open(path) as f: + for line_idx, line in enumerate(f): + yield path, line_idx, line + + +def in_allowlist(file_path, allowlist): + for allowed_file in allowlist: + if file_path.endswith(allowed_file): + return True + return False + + +def test_no_private_tf_api(): + # TODO: remove all elements of the list and remove the allowlist + # This allowlist should not grow. Do not add elements to this list. + allowlist = [ + "tensorflow_nufft/metrics/r_square.py", + "tensorflow_nufft/utils/test_utils.py", + "tensorflow_nufft/seq2seq/decoder.py", + "tensorflow_nufft/utils/types.py", + ] + + for file_path, line_idx, line in get_lines_of_source_code(allowlist): + + if "import tensorflow.python" in line or "from tensorflow.python" in line: + raise ImportError( + "A private tensorflow API import was found in {} at line {}.\n" + "tensorflow.python refers to TensorFlow's internal source " + "code and private functions/classes.\n" + "The use of those is forbidden in NUFFT for stability reasons." 
+ "\nYou should find a public alternative or ask the " + "TensorFlow team to expose publicly the function/class " + "that you are using.\n" + "If you're trying to do `import tensorflow.python.keras` " + "it can be replaced with `import tensorflow.keras`." + "".format(file_path, line_idx + 1) + ) + + +def test_no_tf_cond(): + # TODO: remove all elements of the list and remove the allowlist + # This allowlist should not grow. Do not add elements to this list. + allowlist = [ + "tensorflow_nufft/text/crf.py", + "tensorflow_nufft/layers/wrappers.py", + "tensorflow_nufft/image/connected_components.py", + "tensorflow_nufft/optimizers/novograd.py", + "tensorflow_nufft/metrics/cohens_kappa.py", + "tensorflow_nufft/seq2seq/sampler.py", + "tensorflow_nufft/seq2seq/beam_search_decoder.py", + ] + for file_path, line_idx, line in get_lines_of_source_code(allowlist): + + if "tf.cond(" in line: + raise NameError( + "The usage of a tf.cond() function call was found in " + "file {} at line {}:\n\n" + " {}\n" + "In TensorFlow 2.x, using a simple `if` in a function decorated " + "with `@tf.function` is equivalent to a tf.cond() thanks to Autograph. \n" + "TensorFlow NUFFT aims to be written with idiomatic TF 2.x code. \n" + "As such, using tf.cond() is not allowed in the codebase. \n" + "Use a `if` and decorate your function with @tf.function instead. \n" + "You can take a look at " + "https://www.tensorflow.org/guide/function#use_python_control_flow" + "".format(file_path, line_idx, line) + ) + + +def test_no_experimental_api(): + # TODO: remove all elements of the list and remove the allowlist + # This allowlist should not grow. Do not add elements to this list. 
+ allowlist = [ + "tensorflow_nufft/optimizers/constants.py", + "tensorflow_nufft/optimizers/weight_decay_optimizers.py", + "tensorflow_nufft/layers/max_unpooling_2d.py", + "tensorflow_nufft/image/dense_image_warp.py", + ] + for file_path, line_idx, line in get_lines_of_source_code(allowlist): + + if file_path.endswith("_test.py") or file_path.endswith("conftest.py"): + continue + if file_path.endswith("tensorflow_nufft/utils/test_utils.py"): + continue + + if "experimental" in line: + raise NameError( + "The usage of a TensorFlow experimental API was found in file {} " + "at line {}:\n\n" + " {}\n" + "Experimental APIs are ok in tests but not in user-facing code. " + "This is because Experimental APIs might have bugs and are not " + "widely used yet.\n" + "NUFFT should show how to write TensorFlow " + "code in a stable and forward-compatible way." + "".format(file_path, line_idx, line) + ) + + +def test_no_tf_control_dependencies(): + # TODO: remove all elements of the list and remove the allowlist + # This allowlist should not grow. Do not add elements to this list. 
+ allowlist = [ + "tensorflow_nufft/layers/wrappers.py", + "tensorflow_nufft/image/utils.py", + "tensorflow_nufft/image/dense_image_warp.py", + "tensorflow_nufft/optimizers/average_wrapper.py", + "tensorflow_nufft/optimizers/discriminative_layer_training.py", + "tensorflow_nufft/optimizers/yogi.py", + "tensorflow_nufft/optimizers/lookahead.py", + "tensorflow_nufft/optimizers/weight_decay_optimizers.py", + "tensorflow_nufft/optimizers/rectified_adam.py", + "tensorflow_nufft/optimizers/lamb.py", + "tensorflow_nufft/seq2seq/sampler.py", + "tensorflow_nufft/seq2seq/beam_search_decoder.py", + "tensorflow_nufft/seq2seq/attention_wrapper.py", + ] + for file_path, line_idx, line in get_lines_of_source_code(allowlist): + + if "tf.control_dependencies(" in line: + + raise NameError( + "The usage of a tf.control_dependencies() function call was found in " + "file {} at line {}:\n\n" + " {}\n" + "In TensorFlow 2.x, in a function decorated " + "with `@tf.function` the dependencies are controlled automatically" + " thanks to Autograph. \n" + "TensorFlow Addons aims to be written with idiomatic TF 2.x code. \n" + "As such, using tf.control_dependencies() is not allowed in the codebase. \n" + "Decorate your function with @tf.function instead. \n" + "You can take a look at \n" + "https://github.com/tensorflow/community/blob/master/rfcs/20180918-functions-not-sessions-20.md#program-order-semantics--control-dependencies" + "".format(file_path, line_idx, line) + ) + + +def test_no_deprecated_v1(): + # TODO: remove all elements of the list and remove the allowlist + # This allowlist should not grow. Do not add elements to this list. 
+ allowlist = [ + "tensorflow_nufft/text/skip_gram_ops.py", + "tensorflow_nufft/seq2seq/decoder.py", + "tensorflow_nufft/seq2seq/tests/attention_wrapper_test.py", + ] + for file_path, line_idx, line in get_lines_of_source_code(allowlist): + + if "tf.compat.v1" in line: + raise NameError( + "The usage of a tf.compat.v1 API was found in file {} at line {}:\n\n" + " {}\n" + "TensorFlow Addons doesn't support running programs with " + "`tf.compat.v1.disable_v2_behavior()`.\n" + "As such, there should be no need for the compatibility module " + "tf.compat. Please find an alternative using only the TF2.x API." + "".format(file_path, line_idx, line) + ) From 3e66ec05e37f8dc46be24878c6a9f52613ce850f Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 5 Mar 2023 11:59:32 +0000 Subject: [PATCH 02/23] FFTW dependency --- .bazeliskrc | 1 + .devcontainer/devcontainer.json | 3 +- WORKSPACE | 27 ++++++++++++ tensorflow_nufft/BUILD | 68 +++++++++++++++++++---------- tools/build/make_wheel_Linux_x86.sh | 0 tools/docker/build_wheel.Dockerfile | 28 ++++++++++-- 6 files changed, 100 insertions(+), 27 deletions(-) create mode 100644 .bazeliskrc mode change 100644 => 100755 tools/build/make_wheel_Linux_x86.sh diff --git a/.bazeliskrc b/.bazeliskrc new file mode 100644 index 0000000..72fbf2d --- /dev/null +++ b/.bazeliskrc @@ -0,0 +1 @@ +USE_BAZEL_VERSION=5.3.0 diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 7b4ff62..9c275e9 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -9,7 +9,8 @@ "ms-vscode.makefile-tools", "github.vscode-pull-request-github", "github.copilot", - "zxh404.vscode-proto3" + "zxh404.vscode-proto3", + "BazelBuild.vscode-bazel" ], // Enable GPUs. 
"runArgs": [ diff --git a/WORKSPACE b/WORKSPACE index 03def49..bad4f45 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -2,6 +2,14 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") load("//build_deps/tf_dependency:tf_configure.bzl", "tf_configure") load("//build_deps/toolchains/gpu:cuda_configure.bzl", "cuda_configure") +all_content = """\ +filegroup( + name = "all", + srcs = glob(["**"]), + visibility = ["//visibility:public"], +) +""" + http_archive( name = "cub_archive", build_file = "//build_deps/toolchains/gpu:cub.BUILD", @@ -43,3 +51,22 @@ tf_workspace1() load("@org_tensorflow//tensorflow:workspace0.bzl", "tf_workspace0") tf_workspace0() + +http_archive( + name = "rules_foreign_cc", + sha256 = "2a4d07cd64b0719b39a7c12218a3e507672b82a97b98c6a89d38565894cf7c51", + strip_prefix = "rules_foreign_cc-0.9.0", + url = "https://github.com/bazelbuild/rules_foreign_cc/archive/refs/tags/0.9.0.tar.gz", +) + +load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies") + +rules_foreign_cc_dependencies() + +http_archive( + name = "fftw", + build_file_content = all_content, + sha256 = "56c932549852cddcfafdab3820b0200c7742675be92179e59e6215b340e26467", + strip_prefix = "fftw-3.3.10", + url = "https://www.fftw.org/fftw-3.3.10.tar.gz", +) diff --git a/tensorflow_nufft/BUILD b/tensorflow_nufft/BUILD index 70435e8..373306a 100644 --- a/tensorflow_nufft/BUILD +++ b/tensorflow_nufft/BUILD @@ -2,6 +2,10 @@ licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//visibility:public"]) +load("@rules_foreign_cc//foreign_cc:configure.bzl", "configure_make") + +load("//tensorflow_nufft:tensorflow_nufft.bzl", "custom_op_library") + config_setting( name = "windows", constraint_values = ["@bazel_tools//platforms:windows"], @@ -10,28 +14,46 @@ config_setting( py_library( name = "tensorflow_nufft", srcs = glob(["*.py"]), - -- deps = [ - -- "//tensorflow_nufft/activations", - -- "//tensorflow_nufft/callbacks", - -- "//tensorflow_nufft/image", 
- -- "//tensorflow_addons/layers", - -- "//tensorflow_addons/losses", - -- "//tensorflow_addons/metrics", - -- "//tensorflow_addons/optimizers", - -- "//tensorflow_addons/rnn", - -- "//tensorflow_addons/seq2seq", - -- "//tensorflow_addons/testing", - -- "//tensorflow_addons/text", - -- "//tensorflow_addons/utils", - -- ], + data = [ + ":python/ops/_nufft_ops.so", + ], ) --- py_test( --- name = "tensorflow_nufft_test", --- size = "small", --- srcs = glob(["tests/*"]), --- main = "tests/run_all_test.py", --- deps = [ --- ":tensorflow_nufft", --- ], --- ) +custom_op_library( + name = "python/ops/_nufft_ops.so", + srcs = [ + "cc/kernels/legendre_rule_fast.cc", + "cc/kernels/legendre_rule_fast.h", + "cc/kernels/nufft_kernels.cc", + "cc/kernels/nufft_options.h", + "cc/kernels/nufft_plan.cc", + "cc/kernels/nufft_plan.h", + "cc/kernels/nufft_util.cc", + "cc/kernels/nufft_util.h", + "cc/kernels/reverse_functor_cpu.cc", + "cc/kernels/reverse_functor.h", + "cc/kernels/transpose_functor_cpu.cc", + "cc/kernels/transpose_functor.h", + "cc/ops/nufft_ops.cc", + ], + cuda_srcs = [ + "cc/kernels/nufft_plan.h", + "cc/kernels/nufft_plan.cu.cc", + "cc/kernels/reverse_functor.h", + "cc/kernels/reverse_functor_gpu.cu.cc", + "cc/kernels/transpose_functor.h", + "cc/kernels/transpose_functor_gpu.cu.cc", + ], + deps = [ + "fftw" + ], +) + +configure_make( + name = "fftw", + configure_in_place = True, + lib_source = "@fftw//:all", + out_include_dir = "api", + out_lib_dir = ".libs", + out_static_libs = ["libfftw3.a"], +) diff --git a/tools/build/make_wheel_Linux_x86.sh b/tools/build/make_wheel_Linux_x86.sh old mode 100644 new mode 100755 diff --git a/tools/docker/build_wheel.Dockerfile b/tools/docker/build_wheel.Dockerfile index 34eb3a1..5f1a100 100644 --- a/tools/docker/build_wheel.Dockerfile +++ b/tools/docker/build_wheel.Dockerfile @@ -1,6 +1,7 @@ #syntax=docker/dockerfile:1.1.5-experimental -ARG PY_VERSION -FROM tensorflow/build:2.11-python$PY_VERSION as base_install +ARG PY_VERSION=3.8 
+ARG TF_VERSION=2.11 +FROM tensorflow/build:$TF_VERSION-python$PY_VERSION as base_install ENV TF_NEED_CUDA="1" ARG PY_VERSION @@ -12,6 +13,24 @@ RUN python -m pip uninstall -y keras-nightly RUN python -m pip install --default-timeout=1000 tensorflow==$TF_VERSION +# # Install FFTW library. +# ARG FFTW_VERSION=3.3.10 +# ARG PREFIX=/dt9/usr +# ARG CC="${PREFIX}/bin/gcc" +# ARG CXX="${PREFIX}/bin/g++" +# ARG LIBDIR="${PREFIX}/lib" +# ARG INCLUDEDIR="${PREFIX}/include" +# ARG CFLAGS="-O3 -march=x86-64 -mtune=generic -fPIC" +# RUN cd /opt && \ +# curl -sL http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz | tar xz && \ +# cd fftw-${FFTW_VERSION} && \ +# ./configure CC="${CC}" CFLAGS="${CFLAGS}" --prefix ${PREFIX} --enable-openmp --enable-float && \ +# make && \ +# make install && \ +# ./configure CC="${CC}" CFLAGS="${CFLAGS}" --prefix ${PREFIX} --enable-openmp && \ +# make && \ +# make install + COPY tools/install_deps/ /install_deps RUN python -m pip install -r /install_deps/pytest.txt @@ -31,7 +50,10 @@ ARG NIGHTLY_FLAG= ARG NIGHTLY_TIME= ARG SKIP_CUSTOM_OP_TESTS= -# SKIP_CUSTOM_OP_TESTS is not supported. +# NIGHTLY_FLAG, NIGHTLY_TIME and SKIP_CUSTOM_OP_TESTS are not currently +# supported for TensorFlow NUFFT. 
+RUN if [[ -n "$NIGHTLY_FLAG" ]] ; then exit 1 ; fi +RUN if [[ -n "$NIGHTLY_TIME" ]] ; then exit 1 ; fi RUN if [[ -n "$SKIP_CUSTOM_OP_TESTS" ]] ; then exit 1 ; fi RUN python configure.py From 1e8235edf3732944d5c86296079586c8e1a31719 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sat, 11 Mar 2023 16:51:14 +0000 Subject: [PATCH 03/23] Added FFTW3 dep --- .gitignore | 1 + WORKSPACE | 38 +++++++++++++------ build_deps/fftw3/BUILD | 10 +++++ .../toolchains/gpu/cuda/build_defs.bzl.tpl | 7 ++++ tensorflow_nufft/BUILD | 25 +++++++----- 5 files changed, 59 insertions(+), 22 deletions(-) create mode 100644 build_deps/fftw3/BUILD diff --git a/.gitignore b/.gitignore index 91f291b..3fbf52a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ *.ipynb_checkpoints* __pycache__/ artifacts/ +wheelhouse/ docs/_* docs/api_docs/tfft/ *_pb2.py diff --git a/WORKSPACE b/WORKSPACE index bad4f45..407db23 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -2,7 +2,7 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") load("//build_deps/tf_dependency:tf_configure.bzl", "tf_configure") load("//build_deps/toolchains/gpu:cuda_configure.bzl", "cuda_configure") -all_content = """\ +ALL_CONTENT = """\ filegroup( name = "all", srcs = glob(["**"]), @@ -10,6 +10,10 @@ filegroup( ) """ +tf_configure(name = "local_config_tf") + +cuda_configure(name = "local_config_cuda") + http_archive( name = "cub_archive", build_file = "//build_deps/toolchains/gpu:cub.BUILD", @@ -21,12 +25,6 @@ http_archive( ], ) -tf_configure( - name = "local_config_tf", -) - -cuda_configure(name = "local_config_cuda") - http_archive( name = "org_tensorflow", sha256 = "99c732b92b1b37fc243a559e02f9aef5671771e272758aa4aec7f34dc92dac48", @@ -52,6 +50,22 @@ load("@org_tensorflow//tensorflow:workspace0.bzl", "tf_workspace0") tf_workspace0() +# Compilation of protocol buffers. 
+http_archive( + name = "rules_proto", + sha256 = "dc3fb206a2cb3441b485eb1e423165b231235a1ea9b031b4433cf7bc1fa460dd", + strip_prefix = "rules_proto-5.3.0-21.7", + urls = [ + "https://github.com/bazelbuild/rules_proto/archive/refs/tags/5.3.0-21.7.tar.gz", + ], +) + +load("@rules_proto//proto:repositories.bzl", "rules_proto_dependencies", "rules_proto_toolchains") + +rules_proto_dependencies() +rules_proto_toolchains() + +# Compilation of non-Bazel C/C++ projects (e.g., FFTW3). http_archive( name = "rules_foreign_cc", sha256 = "2a4d07cd64b0719b39a7c12218a3e507672b82a97b98c6a89d38565894cf7c51", @@ -64,9 +78,9 @@ load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_depende rules_foreign_cc_dependencies() http_archive( - name = "fftw", - build_file_content = all_content, - sha256 = "56c932549852cddcfafdab3820b0200c7742675be92179e59e6215b340e26467", - strip_prefix = "fftw-3.3.10", - url = "https://www.fftw.org/fftw-3.3.10.tar.gz", + name = "fftw3_archive", + build_file_content = ALL_CONTENT, + sha256 = "56c932549852cddcfafdab3820b0200c7742675be92179e59e6215b340e26467", + strip_prefix = "fftw-3.3.10", + url = "https://www.fftw.org/fftw-3.3.10.tar.gz", ) diff --git a/build_deps/fftw3/BUILD b/build_deps/fftw3/BUILD new file mode 100644 index 0000000..80ea685 --- /dev/null +++ b/build_deps/fftw3/BUILD @@ -0,0 +1,10 @@ +load("@rules_foreign_cc//foreign_cc:configure.bzl", "configure_make") + +configure_make( + name = "fftw3", + configure_in_place = True, + configure_options = ["CFLAGS=\"-fPIC\""], + lib_source = "@fftw3_archive//:all", + out_static_libs = ["libfftw3.a"], + visibility = ["//visibility:public"], +) diff --git a/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl b/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl index a4f484f..c04bbb9 100644 --- a/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl +++ b/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl @@ -22,6 +22,7 @@ def cuda_is_configured(): """Returns true if CUDA was enabled during the 
configure process.""" return %{cuda_is_configured} + def if_cuda_is_configured(x): """Tests if the CUDA was enabled during the configure process. @@ -32,6 +33,7 @@ def if_cuda_is_configured(x): return x return [] + def cuda_header_library( name, hdrs, @@ -60,3 +62,8 @@ def cuda_header_library( deps = deps + [":%s_virtual" % name], **kwargs ) + + +def cuda_library(copts = [], **kwargs): + """Wrapper over cc_library which adds default CUDA options.""" + native.cc_library(copts = cuda_default_copts() + copts, **kwargs) diff --git a/tensorflow_nufft/BUILD b/tensorflow_nufft/BUILD index 373306a..d6b83fd 100644 --- a/tensorflow_nufft/BUILD +++ b/tensorflow_nufft/BUILD @@ -2,8 +2,7 @@ licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//visibility:public"]) -load("@rules_foreign_cc//foreign_cc:configure.bzl", "configure_make") - +load("@rules_proto//proto:defs.bzl", "proto_library") load("//tensorflow_nufft:tensorflow_nufft.bzl", "custom_op_library") config_setting( @@ -22,6 +21,10 @@ py_library( custom_op_library( name = "python/ops/_nufft_ops.so", srcs = [ + "cc/kernels/fftw_api.h", + "cc/kernels/omp_api.h", + "cc/kernels/kernel_horner_sigma2.inc", + "cc/kernels/kernel_horner_sigma125.inc", "cc/kernels/legendre_rule_fast.cc", "cc/kernels/legendre_rule_fast.h", "cc/kernels/nufft_kernels.cc", @@ -45,15 +48,17 @@ custom_op_library( "cc/kernels/transpose_functor_gpu.cu.cc", ], deps = [ - "fftw" + "@//build_deps/fftw3", + ":nufft_options_cc_proto", ], ) -configure_make( - name = "fftw", - configure_in_place = True, - lib_source = "@fftw//:all", - out_include_dir = "api", - out_lib_dir = ".libs", - out_static_libs = ["libfftw3.a"], +cc_proto_library( + name = "nufft_options_cc_proto", + deps = [":nufft_options_proto"], +) + +proto_library( + name = "nufft_options_proto", + srcs = ["proto/nufft_options.proto"], ) From 1c60808f3f0623120400acb1dcb195d5258a5024 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 12 Mar 2023 16:46:51 +0000 Subject: 
[PATCH 04/23] Add Python proto to Bazel build --- tensorflow_nufft/BUILD | 7 +++++++ tools/install_so_files.sh | 1 + 2 files changed, 8 insertions(+) mode change 100644 => 100755 tools/install_so_files.sh diff --git a/tensorflow_nufft/BUILD b/tensorflow_nufft/BUILD index d6b83fd..48e34cf 100644 --- a/tensorflow_nufft/BUILD +++ b/tensorflow_nufft/BUILD @@ -2,6 +2,7 @@ licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//visibility:public"]) +load("@com_google_protobuf//:protobuf.bzl", "py_proto_library") load("@rules_proto//proto:defs.bzl", "proto_library") load("//tensorflow_nufft:tensorflow_nufft.bzl", "custom_op_library") @@ -15,6 +16,7 @@ py_library( srcs = glob(["*.py"]), data = [ ":python/ops/_nufft_ops.so", + ":nufft_options_py_proto", ], ) @@ -53,6 +55,11 @@ custom_op_library( ], ) +py_proto_library( + name = "nufft_options_py_proto", + srcs = ["proto/nufft_options.proto"], +) + cc_proto_library( name = "nufft_options_cc_proto", deps = [":nufft_options_proto"], diff --git a/tools/install_so_files.sh b/tools/install_so_files.sh old mode 100644 new mode 100755 index 6b6168f..c436b3d --- a/tools/install_so_files.sh +++ b/tools/install_so_files.sh @@ -6,3 +6,4 @@ fi bazel build $CUDA_FLAG //tensorflow_nufft/... 
cp ./bazel-bin/tensorflow_nufft/python/ops/_nufft_ops.so ./tensorflow_nufft/python/ops/ +cp ./bazel-bin/tensorflow_nufft/proto/nufft_options_pb2.py ./tensorflow_nufft/proto/ From de940e952fe329f7246d927c9f346bb5a6f55bb0 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 12 Mar 2023 17:50:53 +0000 Subject: [PATCH 05/23] Add single-precision FFTW library --- build_deps/fftw3/BUILD | 26 ++++++++++++++++++++++++-- tensorflow_nufft/BUILD | 3 ++- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/build_deps/fftw3/BUILD b/build_deps/fftw3/BUILD index 80ea685..16919a3 100644 --- a/build_deps/fftw3/BUILD +++ b/build_deps/fftw3/BUILD @@ -3,8 +3,30 @@ load("@rules_foreign_cc//foreign_cc:configure.bzl", "configure_make") configure_make( name = "fftw3", configure_in_place = True, - configure_options = ["CFLAGS=\"-fPIC\""], + configure_options = [ + "CFLAGS=\"-fPIC\"", + "--enable-openmp", + ], lib_source = "@fftw3_archive//:all", - out_static_libs = ["libfftw3.a"], + out_static_libs = [ + "libfftw3.a", + "libfftw3_omp.a", + ], + visibility = ["//visibility:public"], +) + +configure_make( + name = "fftw3f", + configure_in_place = True, + configure_options = [ + "CFLAGS=\"-fPIC\"", + "--enable-openmp", + "--enable-float", + ], + lib_source = "@fftw3_archive//:all", + out_static_libs = [ + "libfftw3f.a", + "libfftw3f_omp.a", + ], visibility = ["//visibility:public"], ) diff --git a/tensorflow_nufft/BUILD b/tensorflow_nufft/BUILD index 48e34cf..81ebdaf 100644 --- a/tensorflow_nufft/BUILD +++ b/tensorflow_nufft/BUILD @@ -50,7 +50,8 @@ custom_op_library( "cc/kernels/transpose_functor_gpu.cu.cc", ], deps = [ - "@//build_deps/fftw3", + "@//build_deps/fftw3:fftw3", + "@//build_deps/fftw3:fftw3f", ":nufft_options_cc_proto", ], ) From 508985ba7a4ebca8c101747976247ac0d6794128 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sat, 18 Mar 2023 08:25:52 +0000 Subject: [PATCH 06/23] Update dev container --- .devcontainer/Dockerfile | 28 
---------------------------- .devcontainer/devcontainer.json | 30 ++++++++++++++++++------------ Dockerfile | 32 ++++++++++++++++++++++++++++++++ WORKSPACE | 5 ++--- 4 files changed, 52 insertions(+), 43 deletions(-) delete mode 100644 .devcontainer/Dockerfile create mode 100644 Dockerfile diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile deleted file mode 100644 index 286dd5a..0000000 --- a/.devcontainer/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM ghcr.io/mrphys/tensorflow-manylinux:1.15.0 - -# To enable plotting. -RUN apt-get update && \ - apt-get install -y libcairo2-dev libgirepository1.0-dev libgtk-3-dev - -ARG PYBIN=/usr/local/bin/python -ARG PYVERSIONS="3.7 3.8 3.9 3.10" -RUN for PYVER in ${PYVERSIONS}; do ${PYBIN}${PYVER} -m pip install pycairo PyGObject; done - -# Install TFMRI dependencies. -COPY requirements.txt /tmp/requirements.txt -RUN for PYVER in ${PYVERSIONS}; do ${PYBIN}${PYVER} -m pip install -r /tmp/requirements.txt; done - -# Create non-root user. -ARG USERNAME=vscode -ARG USER_UID=1000 -ARG USER_GID=$USER_UID - -RUN groupadd --gid $USER_GID $USERNAME && \ - useradd --uid $USER_UID --gid $USER_GID -m $USERNAME && \ - # Add user to sudoers. - apt-get update && \ - apt-get install -y sudo && \ - echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME && \ - chmod 0440 /etc/sudoers.d/$USERNAME && \ - # Change default shell to bash. - usermod --shell /bin/bash $USERNAME diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 9c275e9..10869c0 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,17 +1,23 @@ { - "dockerFile": "./Dockerfile", - "context": "..", - "settings": {}, + "build": { + "dockerfile": "../Dockerfile", + "context": "..", + "target": "dev" + }, // Useful extensions. 
- "extensions": [ - "ms-python.python", - "ms-vscode.cpptools", - "ms-vscode.makefile-tools", - "github.vscode-pull-request-github", - "github.copilot", - "zxh404.vscode-proto3", - "BazelBuild.vscode-bazel" - ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-vscode.cpptools", + "ms-vscode.makefile-tools", + "github.vscode-pull-request-github", + "github.copilot", + "zxh404.vscode-proto3", + "BazelBuild.vscode-bazel" + ] + } + }, // Enable GPUs. "runArgs": [ "--gpus=all" diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c15498f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,32 @@ +#syntax=docker/dockerfile:1.1.5-experimental +ARG PY_VERSION=3.8 +ARG TF_VERSION=2.11 +FROM tensorflow/build:$TF_VERSION-python$PY_VERSION as base + +ENV TF_NEED_CUDA="1" +ARG PY_VERSION +ARG TF_VERSION + +# TODO: Remove this if tensorflow/build container removes their keras-nightly install +# https://github.com/tensorflow/build/issues/78 +RUN python -m pip uninstall -y keras-nightly + +RUN python -m pip install --default-timeout=1000 tensorflow==$TF_VERSION + +# Dev container. +FROM base as dev + +# Create non-root user. +ARG USERNAME=vscode +ARG USER_UID=1000 +ARG USER_GID=$USER_UID + +RUN groupadd --gid $USER_GID $USERNAME && \ + useradd --uid $USER_UID --gid $USER_GID -m $USERNAME && \ + # Add user to sudoers. + apt-get update && \ + apt-get install -y sudo && \ + echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME && \ + chmod 0440 /etc/sudoers.d/$USERNAME && \ + # Change default shell to bash. 
+ usermod --shell /bin/bash $USERNAME diff --git a/WORKSPACE b/WORKSPACE index 407db23..0ff6696 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -80,7 +80,6 @@ rules_foreign_cc_dependencies() http_archive( name = "fftw3_archive", build_file_content = ALL_CONTENT, - sha256 = "56c932549852cddcfafdab3820b0200c7742675be92179e59e6215b340e26467", - strip_prefix = "fftw-3.3.10", - url = "https://www.fftw.org/fftw-3.3.10.tar.gz", + strip_prefix = "fftw-3.3.9", + url = "https://www.fftw.org/fftw-3.3.9.tar.gz", ) From 44a6c5d6b805138271f2feb14f5a430095f93862 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sat, 18 Mar 2023 08:29:38 +0000 Subject: [PATCH 07/23] Update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 3fbf52a..635aa53 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,5 @@ docs/api_docs/tfft/ *_pb2.py *.pb.h *.pb.cc + +.bazelrc From 1dd6e42a75cff6608325167d2081a441cb45ec01 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sat, 18 Mar 2023 13:24:26 +0000 Subject: [PATCH 08/23] Working on compilation --- .devcontainer/devcontainer.json | 8 ++++---- WORKSPACE | 1 + tools/build_and_install.sh | 11 +++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) create mode 100755 tools/build_and_install.sh diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 10869c0..1d9c035 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -18,10 +18,10 @@ ] } }, - // Enable GPUs. - "runArgs": [ - "--gpus=all" - ], + // Uncomment below to enable GPU support within dev container. + // "runArgs": [ + // "--gpus=all" + // ], // Enable plotting. 
"mounts": [ "type=bind,source=/tmp/.X11-unix,target=/tmp/.X11-unix" diff --git a/WORKSPACE b/WORKSPACE index 0ff6696..78d6c89 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -80,6 +80,7 @@ rules_foreign_cc_dependencies() http_archive( name = "fftw3_archive", build_file_content = ALL_CONTENT, + sha256 = "bf2c7ce40b04ae811af714deb512510cc2c17b9ab9d6ddcf49fe4487eea7af3d", strip_prefix = "fftw-3.3.9", url = "https://www.fftw.org/fftw-3.3.9.tar.gz", ) diff --git a/tools/build_and_install.sh b/tools/build_and_install.sh new file mode 100755 index 0000000..916d4f8 --- /dev/null +++ b/tools/build_and_install.sh @@ -0,0 +1,11 @@ +set -e -x + +python ./configure.py + +if [ "$TF_NEED_CUDA" == "1" ]; then + CUDA_FLAG="--crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain" +fi + +bazel build $CUDA_FLAG //tensorflow_nufft/... +cp ./bazel-bin/tensorflow_nufft/python/ops/_nufft_ops.so ./tensorflow_nufft/python/ops/ +cp ./bazel-bin/tensorflow_nufft/proto/nufft_options_pb2.py ./tensorflow_nufft/proto/ From ad340dd99d99bac0ee123bae980644354fc955fe Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sat, 18 Mar 2023 13:26:19 +0000 Subject: [PATCH 09/23] Add bazel rules to gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 635aa53..ac9d367 100644 --- a/.gitignore +++ b/.gitignore @@ -16,4 +16,6 @@ docs/api_docs/tfft/ *.pb.h *.pb.cc -.bazelrc +# Bazel +/.bazelrc +/bazel-* From b270188cb0295bcad52eb62cc71d4d52610651a6 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sat, 18 Mar 2023 16:21:34 +0000 Subject: [PATCH 10/23] Link against GOMP --- tensorflow_nufft/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow_nufft/BUILD b/tensorflow_nufft/BUILD index 81ebdaf..400f45e 100644 --- a/tensorflow_nufft/BUILD +++ b/tensorflow_nufft/BUILD @@ -54,6 +54,8 @@ custom_op_library( "@//build_deps/fftw3:fftw3f", ":nufft_options_cc_proto", ], + 
copts = ["-fopenmp"], + linkopts = ["-lgomp"], ) py_proto_library( From 6394976dd4a4f28d5dec00d6aabd9c9b37ff884d Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sat, 18 Mar 2023 16:59:29 +0000 Subject: [PATCH 11/23] GHA workflow --- .devcontainer/devcontainer.json | 2 +- .github/workflows/build-and-release.yml | 157 ++++++++++++++++++ .../{build-package.yml => build-package.txt} | 0 Dockerfile | 60 ++++++- tools/build/make_wheel_Linux_x86.sh | 5 +- tools/docker/build_wheel.Dockerfile | 38 +---- tools/testing/build_and_run_tests.sh | 5 +- 7 files changed, 222 insertions(+), 45 deletions(-) create mode 100644 .github/workflows/build-and-release.yml rename .github/workflows/{build-package.yml => build-package.txt} (100%) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 1d9c035..6fc1ef2 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,7 +2,7 @@ "build": { "dockerfile": "../Dockerfile", "context": "..", - "target": "dev" + "target": "dev_container" }, // Useful extensions. 
"customizations": { diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml new file mode 100644 index 0000000..2d8f568 --- /dev/null +++ b/.github/workflows/build-and-release.yml @@ -0,0 +1,157 @@ +name: build-and-release + +on: + push: + branches: + - master + tags: + - v[0-9]+.[0-9]+.[0-9]+* + pull_request: + branches: + - master + +env: + MIN_PY_VERSION: '3.7' + MAX_PY_VERSION: '3.10' + +jobs: + # test-with-bazel: + # name: Test with bazel + # runs-on: ubuntu-18.04 + # steps: + # - uses: actions/checkout@v2 + # - uses: actions/setup-python@v2 + # with: + # python-version: ${{ env.MIN_PY_VERSION }} + # - name: Build wheels + # run: | + # pip install --default-timeout=1000 -r tools/install_deps/pytest.txt -r tools/install_deps/tensorflow-cpu.txt -r requirements.txt + # bash tools/install_deps/install_bazelisk.sh ./ + # python configure.py + # bazel test -k --test_timeout 300,450,1200,3600 --test_output=errors //tensorflow_addons/... + build: + name: Test and build release wheels + runs-on: ${{ matrix.os }} + strategy: + matrix: + # To switch on windows-2022/latest, please verify the bazel version: + # https://github.com/bazelbuild/bazel/issues/14232#issuecomment-1011247429 + os: ['ubuntu-18.04'] # ['macos-12', 'windows-2019', 'ubuntu-18.04'] + py-version: ['3.8'] # ['3.7', '3.8', '3.9', '3.10'] + tf-version: ['2.11.0'] + cpu: ['x86'] + # include: + # - os: 'macos-12' + # cpu: 'arm64' + # tf-version: '2.11.0' + # py-version: '3.8' + # - os: 'macos-12' + # cpu: 'arm64' + # tf-version: '2.11.0' + # py-version: '3.9' + # - os: 'macos-12' + # cpu: 'arm64' + # tf-version: '2.11.0' + # py-version: '3.10' + fail-fast: false + steps: + # - uses: actions/github-script@0.3.0 + # id: author-date + # with: + # github-token: ${{secrets.GITHUB_TOKEN}} + # script: | + # const commit_details = await github.git.getCommit({owner: context.repo.owner, repo: context.repo.repo, commit_sha: context.sha}); + # return commit_details.data.author.date 
+ # - if: matrix.tf-version != '2.10.0' + # shell: bash + # run: echo "SKIP_CUSTOM_OP_TESTS=--skip-custom-ops" >> $GITHUB_ENV + # - if: github.event_name == 'push' + # shell: bash + # run: echo "NIGHTLY_FLAG=--nightly" >> $GITHUB_ENV + - name: Checkout code + uses: actions/checkout@v2 + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.py-version }} + + - if: matrix.os != 'ubuntu-18.04' + name: Setup Bazel + # Ubuntu bazel is run inside of the docker image + run: bash tools/install_deps/install_bazelisk.sh ./ + + - name: Lint code + run: | + make lint PY_VERSION=${{ matrix.py-version }} + + - name: Build wheels + env: + OS: ${{ runner.os }} + PY_VERSION: ${{ matrix.py-version }} + TF_VERSION: ${{ matrix.tf-version }} + CPU: ${{ matrix.cpu }} + shell: bash + run: bash ./tools/build/make_wheel_${OS}_${CPU}.sh + + - name: Build docs + run: | + make docs PY_VERSION=${{ matrix.py_version }} + + - uses: actions/upload-artifact@v1 + with: + name: ${{ runner.os }}-${{ matrix.py-version }}-tf${{ matrix.tf-version }}-${{ matrix.cpu }}-wheel + path: wheelhouse + + # upload-wheels: + # name: Publish wheels to PyPi + # needs: [release-wheel, test-with-bazel] + # runs-on: ubuntu-18.04 + # strategy: + # matrix: + # os: ['macOS', 'Windows', 'Linux'] + # py-version: ['3.7', '3.8', '3.9', '3.10'] + # tf-version: ['2.11.0'] + # cpu: ['x86'] + # include: + # - os: 'macOS' + # cpu: 'arm64' + # tf-version: '2.11.0' + # py-version: '3.8' + # - os: 'macOS' + # cpu: 'arm64' + # tf-version: '2.11.0' + # py-version: '3.9' + # - os: 'macOS' + # cpu: 'arm64' + # tf-version: '2.11.0' + # py-version: '3.10' + # fail-fast: false + # if: (github.event_name == 'push' && github.ref == 'refs/heads/master') || github.event_name == 'release' + # steps: + # - uses: actions/download-artifact@v1 + # with: + # name: ${{ matrix.os }}-${{ matrix.py-version }}-tf${{ matrix.tf-version }}-${{ matrix.cpu }}-wheel + # path: ./dist + # - run: | + # set -e -x + # ls -la 
dist/ + # sha256sum dist/*.whl + # - uses: pypa/gh-action-pypi-publish@v1.1.0 + # with: + # user: __token__ + # password: ${{ secrets.pypi_token }} + # upload-dev-container: + # name: Upload dev container to DockerHub + # needs: [release-wheel, test-with-bazel] + # runs-on: ubuntu-18.04 + # env: + # PY_VERSION: '3.9' + # if: (github.event_name == 'push' && github.ref == 'refs/heads/master') + # steps: + # - uses: actions/checkout@v2 + # - run: | + # set -e -x + # echo ${{ secrets.DOCKER_PW }} | docker login --username ${{ secrets.DOCKER_USER }} --password-stdin + # bash .github/workflows/github_build_dev_container.sh + # docker push tfaddons/dev_container:latest-gpu diff --git a/.github/workflows/build-package.yml b/.github/workflows/build-package.txt similarity index 100% rename from .github/workflows/build-package.yml rename to .github/workflows/build-package.txt diff --git a/Dockerfile b/Dockerfile index c15498f..4f6eef6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ #syntax=docker/dockerfile:1.1.5-experimental ARG PY_VERSION=3.8 ARG TF_VERSION=2.11 -FROM tensorflow/build:$TF_VERSION-python$PY_VERSION as base +FROM tensorflow/build:$TF_VERSION-python$PY_VERSION as base_install ENV TF_NEED_CUDA="1" ARG PY_VERSION @@ -13,8 +13,41 @@ RUN python -m pip uninstall -y keras-nightly RUN python -m pip install --default-timeout=1000 tensorflow==$TF_VERSION +COPY requirements.txt . +RUN python -m pip install -r requirements.txt + +COPY ./ /tensorflow-nufft +WORKDIR /tensorflow-nufft + + +# ------------------------------------------------------------------------------ +# Make wheel. 
+# ------------------------------------------------------------------------------ +FROM base_install as make_wheel + +RUN python configure.py + +RUN bash tools/testing/build_and_run_tests.sh + +RUN bazel build \ + --noshow_progress \ + --noshow_loading_progress \ + --verbose_failures \ + --test_output=errors \ + --crosstool_top=@ubuntu20.04-gcc9_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain \ + build_pip_pkg && \ + # Package Whl + bazel-bin/build_pip_pkg artifacts + +RUN bash tools/releases/tf_auditwheel_patch.sh +RUN python -m auditwheel repair --plat manylinux2014_x86_64 artifacts/*.whl +RUN ls -al wheelhouse/ + + +# ------------------------------------------------------------------------------ # Dev container. -FROM base as dev +# ------------------------------------------------------------------------------ +FROM base_install as dev_container # Create non-root user. ARG USERNAME=vscode @@ -30,3 +63,26 @@ RUN groupadd --gid $USER_GID $USERNAME && \ chmod 0440 /etc/sudoers.d/$USERNAME && \ # Change default shell to bash. usermod --shell /bin/bash $USERNAME + + +# ------------------------------------------------------------------------------ +# Test wheel in fresh environment. +# ------------------------------------------------------------------------------ +FROM python:$PY_VERSION as test_wheel_in_fresh_environment + +ARG TF_VERSION + +RUN python -m pip install --default-timeout=1000 tensorflow==$TF_VERSION + +COPY --from=make_wheel /tensorflow-nufft/wheelhouse/ /tensorflow-nufft/wheelhouse/ +RUN pip install /tensorflow-nufft/wheelhouse/*.whl + +RUN python -c "import tensorflow_nufft as tfft" + + +# ------------------------------------------------------------------------------ +# Build output. +# ------------------------------------------------------------------------------ +FROM scratch as output + +COPY --from=test_wheel_in_fresh_environment /tensorflow-nufft/wheelhouse/ . 
diff --git a/tools/build/make_wheel_Linux_x86.sh b/tools/build/make_wheel_Linux_x86.sh index c76289c..d9a3c84 100755 --- a/tools/build/make_wheel_Linux_x86.sh +++ b/tools/build/make_wheel_Linux_x86.sh @@ -7,11 +7,8 @@ rm -rf /usr/share/dotnet & # Tests are ran as part of make_wheel target DOCKER_BUILDKIT=1 docker build \ - -f tools/docker/build_wheel.Dockerfile \ + -f Dockerfile \ --output type=local,dest=wheelhouse \ --build-arg PY_VERSION \ --build-arg TF_VERSION \ - --build-arg NIGHTLY_FLAG \ - --build-arg NIGHTLY_TIME \ - --build-arg SKIP_CUSTOM_OP_TESTS \ ./ diff --git a/tools/docker/build_wheel.Dockerfile b/tools/docker/build_wheel.Dockerfile index 5f1a100..9c0b227 100644 --- a/tools/docker/build_wheel.Dockerfile +++ b/tools/docker/build_wheel.Dockerfile @@ -13,53 +13,23 @@ RUN python -m pip uninstall -y keras-nightly RUN python -m pip install --default-timeout=1000 tensorflow==$TF_VERSION -# # Install FFTW library. -# ARG FFTW_VERSION=3.3.10 -# ARG PREFIX=/dt9/usr -# ARG CC="${PREFIX}/bin/gcc" -# ARG CXX="${PREFIX}/bin/g++" -# ARG LIBDIR="${PREFIX}/lib" -# ARG INCLUDEDIR="${PREFIX}/include" -# ARG CFLAGS="-O3 -march=x86-64 -mtune=generic -fPIC" -# RUN cd /opt && \ -# curl -sL http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz | tar xz && \ -# cd fftw-${FFTW_VERSION} && \ -# ./configure CC="${CC}" CFLAGS="${CFLAGS}" --prefix ${PREFIX} --enable-openmp --enable-float && \ -# make && \ -# make install && \ -# ./configure CC="${CC}" CFLAGS="${CFLAGS}" --prefix ${PREFIX} --enable-openmp && \ -# make && \ -# make install - -COPY tools/install_deps/ /install_deps -RUN python -m pip install -r /install_deps/pytest.txt - COPY requirements.txt . 
RUN python -m pip install -r requirements.txt COPY ./ /tensorflow-nufft WORKDIR /tensorflow-nufft -# ------------------------------------------------------------------- +# ------------------------------------------------------------------------------ FROM base_install as tfa_gpu_tests CMD ["bash", "tools/testing/build_and_run_tests.sh"] -# ------------------------------------------------------------------- +# ------------------------------------------------------------------------------ FROM base_install as make_wheel -ARG NIGHTLY_FLAG= -ARG NIGHTLY_TIME= -ARG SKIP_CUSTOM_OP_TESTS= - -# NIGHTLY_FLAG, NIGHTLY_TIME and SKIP_CUSTOM_OP_TESTS are not currently -# supported for TensorFlow NUFFT. -RUN if [[ -n "$NIGHTLY_FLAG" ]] ; then exit 1 ; fi -RUN if [[ -n "$NIGHTLY_TIME" ]] ; then exit 1 ; fi -RUN if [[ -n "$SKIP_CUSTOM_OP_TESTS" ]] ; then exit 1 ; fi RUN python configure.py -# Test Before Building -RUN bash tools/testing/build_and_run_tests.sh $SKIP_CUSTOM_OP_TESTS +# Test +RUN bash tools/testing/build_and_run_tests.sh # Build RUN bazel build \ diff --git a/tools/testing/build_and_run_tests.sh b/tools/testing/build_and_run_tests.sh index a17ee31..5e3fb23 100644 --- a/tools/testing/build_and_run_tests.sh +++ b/tools/testing/build_and_run_tests.sh @@ -18,9 +18,6 @@ set -x -e -SKIP_CUSTOM_OP_TESTS_FLAG=${1} - -python -m pip install -r tools/install_deps/pytest.txt -e ./ python ./configure.py bash tools/install_so_files.sh python -c "import tensorflow as tf; print(tf.config.list_physical_devices())" @@ -35,4 +32,4 @@ if ! 
[ -x "$(command -v nvidia-smi)" ]; then fi bazel clean -python -m pytest -v --functions-durations=20 --modules-durations=5 $SKIP_CUSTOM_OP_TESTS_FLAG $EXTRA_ARGS ./tensorflow_nufft +make test From ad92a03eb4bcb123adac980999f0783d31a1cef9 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sat, 18 Mar 2023 17:29:58 +0000 Subject: [PATCH 12/23] Simplify Makefile --- Makefile | 156 +-------------------------------------------------- Makefile.bak | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+), 155 deletions(-) create mode 100644 Makefile.bak diff --git a/Makefile b/Makefile index f6cbc71..dd59f31 100644 --- a/Makefile +++ b/Makefile @@ -1,156 +1,2 @@ -CXX := /dt9/usr/bin/g++ -NVCC := nvcc -PY_VERSION ?= 3.8 -PYTHON = python$(PY_VERSION) - -ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) - -KERNELS_DIR = tensorflow_nufft/cc/kernels -OPS_DIR = tensorflow_nufft/cc/ops -PROTO_DIR = tensorflow_nufft/proto -PYOPS_DIR = tensorflow_nufft/python/ops - -# Protocol buffer source files (*.proto files). -PROTO_SOURCES = $(wildcard $(PROTO_DIR)/*.proto) -# protoc generated files (*.pb.h and *.pb.cc files). -PROTO_OBJECTS = $(patsubst $(PROTO_DIR)/%.proto, $(PROTO_DIR)/%.pb.cc, $(PROTO_SOURCES)) -PROTO_HEADERS = $(patsubst $(PROTO_DIR)/%.proto, $(PROTO_DIR)/%.pb.h, $(PROTO_SOURCES)) -# protoc generated files (*_pb2.py files). 
-PROTO_MODULES = $(patsubst $(PROTO_DIR)/%.proto, $(PROTO_DIR)/%_pb2.py, $(PROTO_SOURCES)) - -CUSOURCES = $(wildcard $(KERNELS_DIR)/*.cu.cc) -CUOBJECTS = $(patsubst %.cu.cc, %.cu.o, $(CUSOURCES)) -CXXSOURCES = $(filter-out $(CUSOURCES), $(wildcard $(KERNELS_DIR)/*.cc) $(wildcard $(OPS_DIR)/*.cc)) -CXXHEADERS = $(wildcard $(KERNELS_DIR)/*.h) $(wildcard $(OPS_DIR)/*.h) - -TARGET_LIB = tensorflow_nufft/python/ops/_nufft_ops.so -TARGET_DLINK = tensorflow_nufft/cc/kernels/nufft_kernels.dlink.o - -TF_CFLAGS := $(shell $(PYTHON) -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') -TF_LDFLAGS := $(shell $(PYTHON) -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') - -CUDA_INCLUDE = /usr/local/cuda/targets/x86_64-linux/include -CUDA_LIBDIR = /usr/local/cuda/targets/x86_64-linux/lib - -CUDA ?= 1 -OMP ?= 1 -CFLAGS = -O3 -march=x86-64 -mtune=generic -funroll-loops -fcx-limited-range - --include make.inc - -CFLAGS += -fPIC - -ifeq ($(CUDA), 1) -TF_CFLAGS += -DGOOGLE_CUDA=1 -endif - -ifeq ($(OMP), 1) -CFLAGS += -fopenmp -endif - -CXXFLAGS = -std=c++17 $(CFLAGS) $(TF_CFLAGS) -CXXFLAGS += -I$(ROOT_DIR) -ifeq ($(CUDA), 1) -CXXFLAGS += -I$(CUDA_INCLUDE) -endif - -# As of TensorFlow 2.7, a deprecated-declarations is triggered by TensorFlow's -# header files, which we can't do anything about. Therefore, disable these -# warnings. 
-CXXFLAGS += -Wno-deprecated-declarations - -# ============================================================================== -# NVCC options -# ============================================================================== - -NVARCH_FLAGS ?= \ - -gencode=arch=compute_35,code=sm_35 \ - -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_52,code=sm_52 \ - -gencode=arch=compute_60,code=sm_60 \ - -gencode=arch=compute_61,code=sm_61 \ - -gencode=arch=compute_70,code=sm_70 \ - -gencode=arch=compute_75,code=sm_75 \ - -gencode=arch=compute_80,code=sm_80 \ - -gencode=arch=compute_86,code=sm_86 \ - -gencode=arch=compute_86,code=compute_86 - -CUDAFE = --diag_suppress=174 --diag_suppress=177 --diag_suppress=611 --diag_suppress=20012 --diag_suppress=1886 --display_error_number - -CUFLAGS = $(NVARCH_FLAGS) -Xcompiler "$(CFLAGS)" $(TF_CFLAGS) -DNDEBUG --expt-relaxed-constexpr -CUFLAGS += -I$(ROOT_DIR) -CUFLAGS += -Xcudafe "$(CUDAFE)" -Wno-deprecated-gpu-targets -CUFLAGS += --std=c++17 - - -# ============================================================================== -# Linker options -# ============================================================================== - -LDFLAGS = -lfftw3 -lfftw3f - -ifeq ($(OMP), 1) -LDFLAGS += -lgomp -LDFLAGS += -lfftw3_omp -lfftw3f_omp -endif - -ifeq ($(CUDA), 1) -LDFLAGS += -L$(CUDA_LIBDIR) -LDFLAGS += -lcudart_static -endif - -LDFLAGS += $(TF_LDFLAGS) - -# ============================================================================== -# TensorFlow NUFFT -# ============================================================================== - -all: lib wheel - -lib: proto $(TARGET_LIB) - -%.cu.o: %.cu.cc - $(NVCC) --compiler-bindir $(CXX) --device-c -x cu $(CUFLAGS) --threads 0 --output-file $@ --compile $< - -$(TARGET_DLINK): $(CUOBJECTS) - $(NVCC) --compiler-bindir $(CXX) --device-link $(CUFLAGS) --threads 0 --output-file $@ $^ - -$(TARGET_LIB): $(CXXSOURCES) $(PROTO_OBJECTS) $(CUOBJECTS) $(TARGET_DLINK) - $(CXX) -shared $(CXXFLAGS) -o 
$@ $^ $(LDFLAGS) - - -# ============================================================================== -# Miscellaneous -# ============================================================================== - -proto: - protoc -I$(PROTO_DIR) --python_out=$(PROTO_DIR) --cpp_out=$(PROTO_DIR) $(PROTO_SOURCES) - -wheel: - ./tools/build/build_pip_pkg.sh make --python $(PYTHON) artifacts - -test: - $(PYTHON) -m unittest discover -v -p *_test.py - -benchmark: $(wildcard tensorflow_nufft/python/ops/*.py) $(TARGET_LIB) - $(PYTHON) tensorflow_nufft/python/ops/nufft_ops_test.py --benchmarks=.* - -lint: $(wildcard tensorflow_nufft/python/ops/*.py) +lint: pylint --rcfile=pylintrc tensorflow_nufft/python - -cpplint: - python2.7 tools/lint/cpplint.py $(CXXSOURCES) $(CUSOURCES) $(CXXHEADERS) - -docs: $(TARGET) - rm -rf docs/_* docs/api_docs/tfft/ - $(MAKE) -C docs dirhtml PY_VERSION=$(PY_VERSION) - -# Cleans compiled objects. -clean: - rm -f $(TARGET_LIB) - rm -f $(TARGET_DLINK) - rm -f $(CUOBJECTS) - rm -f $(PROTO_OBJECTS) $(PROTO_HEADERS) $(PROTO_MODULES) - rm -rf artifacts/ - -.PHONY: all lib proto wheel test benchmark lint docs clean allclean diff --git a/Makefile.bak b/Makefile.bak new file mode 100644 index 0000000..f6cbc71 --- /dev/null +++ b/Makefile.bak @@ -0,0 +1,156 @@ +CXX := /dt9/usr/bin/g++ +NVCC := nvcc +PY_VERSION ?= 3.8 +PYTHON = python$(PY_VERSION) + +ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +KERNELS_DIR = tensorflow_nufft/cc/kernels +OPS_DIR = tensorflow_nufft/cc/ops +PROTO_DIR = tensorflow_nufft/proto +PYOPS_DIR = tensorflow_nufft/python/ops + +# Protocol buffer source files (*.proto files). +PROTO_SOURCES = $(wildcard $(PROTO_DIR)/*.proto) +# protoc generated files (*.pb.h and *.pb.cc files). +PROTO_OBJECTS = $(patsubst $(PROTO_DIR)/%.proto, $(PROTO_DIR)/%.pb.cc, $(PROTO_SOURCES)) +PROTO_HEADERS = $(patsubst $(PROTO_DIR)/%.proto, $(PROTO_DIR)/%.pb.h, $(PROTO_SOURCES)) +# protoc generated files (*_pb2.py files). 
+PROTO_MODULES = $(patsubst $(PROTO_DIR)/%.proto, $(PROTO_DIR)/%_pb2.py, $(PROTO_SOURCES)) + +CUSOURCES = $(wildcard $(KERNELS_DIR)/*.cu.cc) +CUOBJECTS = $(patsubst %.cu.cc, %.cu.o, $(CUSOURCES)) +CXXSOURCES = $(filter-out $(CUSOURCES), $(wildcard $(KERNELS_DIR)/*.cc) $(wildcard $(OPS_DIR)/*.cc)) +CXXHEADERS = $(wildcard $(KERNELS_DIR)/*.h) $(wildcard $(OPS_DIR)/*.h) + +TARGET_LIB = tensorflow_nufft/python/ops/_nufft_ops.so +TARGET_DLINK = tensorflow_nufft/cc/kernels/nufft_kernels.dlink.o + +TF_CFLAGS := $(shell $(PYTHON) -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') +TF_LDFLAGS := $(shell $(PYTHON) -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') + +CUDA_INCLUDE = /usr/local/cuda/targets/x86_64-linux/include +CUDA_LIBDIR = /usr/local/cuda/targets/x86_64-linux/lib + +CUDA ?= 1 +OMP ?= 1 +CFLAGS = -O3 -march=x86-64 -mtune=generic -funroll-loops -fcx-limited-range + +-include make.inc + +CFLAGS += -fPIC + +ifeq ($(CUDA), 1) +TF_CFLAGS += -DGOOGLE_CUDA=1 +endif + +ifeq ($(OMP), 1) +CFLAGS += -fopenmp +endif + +CXXFLAGS = -std=c++17 $(CFLAGS) $(TF_CFLAGS) +CXXFLAGS += -I$(ROOT_DIR) +ifeq ($(CUDA), 1) +CXXFLAGS += -I$(CUDA_INCLUDE) +endif + +# As of TensorFlow 2.7, a deprecated-declarations is triggered by TensorFlow's +# header files, which we can't do anything about. Therefore, disable these +# warnings. 
+CXXFLAGS += -Wno-deprecated-declarations + +# ============================================================================== +# NVCC options +# ============================================================================== + +NVARCH_FLAGS ?= \ + -gencode=arch=compute_35,code=sm_35 \ + -gencode=arch=compute_50,code=sm_50 \ + -gencode=arch=compute_52,code=sm_52 \ + -gencode=arch=compute_60,code=sm_60 \ + -gencode=arch=compute_61,code=sm_61 \ + -gencode=arch=compute_70,code=sm_70 \ + -gencode=arch=compute_75,code=sm_75 \ + -gencode=arch=compute_80,code=sm_80 \ + -gencode=arch=compute_86,code=sm_86 \ + -gencode=arch=compute_86,code=compute_86 + +CUDAFE = --diag_suppress=174 --diag_suppress=177 --diag_suppress=611 --diag_suppress=20012 --diag_suppress=1886 --display_error_number + +CUFLAGS = $(NVARCH_FLAGS) -Xcompiler "$(CFLAGS)" $(TF_CFLAGS) -DNDEBUG --expt-relaxed-constexpr +CUFLAGS += -I$(ROOT_DIR) +CUFLAGS += -Xcudafe "$(CUDAFE)" -Wno-deprecated-gpu-targets +CUFLAGS += --std=c++17 + + +# ============================================================================== +# Linker options +# ============================================================================== + +LDFLAGS = -lfftw3 -lfftw3f + +ifeq ($(OMP), 1) +LDFLAGS += -lgomp +LDFLAGS += -lfftw3_omp -lfftw3f_omp +endif + +ifeq ($(CUDA), 1) +LDFLAGS += -L$(CUDA_LIBDIR) +LDFLAGS += -lcudart_static +endif + +LDFLAGS += $(TF_LDFLAGS) + +# ============================================================================== +# TensorFlow NUFFT +# ============================================================================== + +all: lib wheel + +lib: proto $(TARGET_LIB) + +%.cu.o: %.cu.cc + $(NVCC) --compiler-bindir $(CXX) --device-c -x cu $(CUFLAGS) --threads 0 --output-file $@ --compile $< + +$(TARGET_DLINK): $(CUOBJECTS) + $(NVCC) --compiler-bindir $(CXX) --device-link $(CUFLAGS) --threads 0 --output-file $@ $^ + +$(TARGET_LIB): $(CXXSOURCES) $(PROTO_OBJECTS) $(CUOBJECTS) $(TARGET_DLINK) + $(CXX) -shared $(CXXFLAGS) -o 
$@ $^ $(LDFLAGS) + + +# ============================================================================== +# Miscellaneous +# ============================================================================== + +proto: + protoc -I$(PROTO_DIR) --python_out=$(PROTO_DIR) --cpp_out=$(PROTO_DIR) $(PROTO_SOURCES) + +wheel: + ./tools/build/build_pip_pkg.sh make --python $(PYTHON) artifacts + +test: + $(PYTHON) -m unittest discover -v -p *_test.py + +benchmark: $(wildcard tensorflow_nufft/python/ops/*.py) $(TARGET_LIB) + $(PYTHON) tensorflow_nufft/python/ops/nufft_ops_test.py --benchmarks=.* + +lint: $(wildcard tensorflow_nufft/python/ops/*.py) + pylint --rcfile=pylintrc tensorflow_nufft/python + +cpplint: + python2.7 tools/lint/cpplint.py $(CXXSOURCES) $(CUSOURCES) $(CXXHEADERS) + +docs: $(TARGET) + rm -rf docs/_* docs/api_docs/tfft/ + $(MAKE) -C docs dirhtml PY_VERSION=$(PY_VERSION) + +# Cleans compiled objects. +clean: + rm -f $(TARGET_LIB) + rm -f $(TARGET_DLINK) + rm -f $(CUOBJECTS) + rm -f $(PROTO_OBJECTS) $(PROTO_HEADERS) $(PROTO_MODULES) + rm -rf artifacts/ + +.PHONY: all lib proto wheel test benchmark lint docs clean allclean From ad696401b4b844a9afa96f4fabb802b1a563ce81 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 19 Mar 2023 08:39:43 +0000 Subject: [PATCH 13/23] Add installation requirements --- .github/workflows/build-and-release.yml | 12 ++++++++---- tools/install_deps/requirements.txt | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 tools/install_deps/requirements.txt diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 2d8f568..52525f9 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -1,4 +1,4 @@ -name: build-and-release +name: Build and release on: push: @@ -30,7 +30,7 @@ jobs: # python configure.py # bazel test -k --test_timeout 300,450,1200,3600 --test_output=errors //tensorflow_addons/... 
build: - name: Test and build release wheels + name: Test and build wheels runs-on: ${{ matrix.os }} strategy: matrix: @@ -81,9 +81,13 @@ jobs: # Ubuntu bazel is run inside of the docker image run: bash tools/install_deps/install_bazelisk.sh ./ - - name: Lint code + - name: Install requirements run: | - make lint PY_VERSION=${{ matrix.py-version }} + pip install requirements.txt + pip install -r tools/install_deps/requirements.txt + + - name: Lint code + run: make lint - name: Build wheels env: diff --git a/tools/install_deps/requirements.txt b/tools/install_deps/requirements.txt new file mode 100644 index 0000000..f8b2483 --- /dev/null +++ b/tools/install_deps/requirements.txt @@ -0,0 +1 @@ +pylint==2.17.0 From d6dca3a7de07af1f88700a98f856f18e9a0d9793 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 19 Mar 2023 08:41:07 +0000 Subject: [PATCH 14/23] Upgrade Ubuntu --- .github/workflows/build-and-release.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 52525f9..576651d 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -17,7 +17,7 @@ env: jobs: # test-with-bazel: # name: Test with bazel - # runs-on: ubuntu-18.04 + # runs-on: ubuntu-20.04 # steps: # - uses: actions/checkout@v2 # - uses: actions/setup-python@v2 @@ -36,7 +36,7 @@ jobs: matrix: # To switch on windows-2022/latest, please verify the bazel version: # https://github.com/bazelbuild/bazel/issues/14232#issuecomment-1011247429 - os: ['ubuntu-18.04'] # ['macos-12', 'windows-2019', 'ubuntu-18.04'] + os: ['ubuntu-20.04'] # ['macos-12', 'windows-2019', 'ubuntu-20.04'] py-version: ['3.8'] # ['3.7', '3.8', '3.9', '3.10'] tf-version: ['2.11.0'] cpu: ['x86'] @@ -76,7 +76,7 @@ jobs: with: python-version: ${{ matrix.py-version }} - - if: matrix.os != 'ubuntu-18.04' + - if: matrix.os != 'ubuntu-20.04' name: Setup Bazel # Ubuntu bazel is 
run inside of the docker image run: bash tools/install_deps/install_bazelisk.sh ./ @@ -110,7 +110,7 @@ jobs: # upload-wheels: # name: Publish wheels to PyPi # needs: [release-wheel, test-with-bazel] - # runs-on: ubuntu-18.04 + # runs-on: ubuntu-20.04 # strategy: # matrix: # os: ['macOS', 'Windows', 'Linux'] @@ -148,7 +148,7 @@ jobs: # upload-dev-container: # name: Upload dev container to DockerHub # needs: [release-wheel, test-with-bazel] - # runs-on: ubuntu-18.04 + # runs-on: ubuntu-20.04 # env: # PY_VERSION: '3.9' # if: (github.event_name == 'push' && github.ref == 'refs/heads/master') From 410c5f6350287119aab4c9a9acd82334d8b8ac95 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 19 Mar 2023 08:41:47 +0000 Subject: [PATCH 15/23] Fix requirements installation --- .github/workflows/build-and-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 576651d..4df59ee 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -83,7 +83,7 @@ jobs: - name: Install requirements run: | - pip install requirements.txt + pip install -r requirements.txt pip install -r tools/install_deps/requirements.txt - name: Lint code From 661888a05653ac1b66ae1c5e37e1ada55188b52a Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 19 Mar 2023 08:57:12 +0000 Subject: [PATCH 16/23] Fix pylint version --- tensorflow_nufft/python/ops/nufft_ops.py | 62 ++++++++++++------------ tools/install_deps/requirements.txt | 2 +- 2 files changed, 31 insertions(+), 33 deletions(-) diff --git a/tensorflow_nufft/python/ops/nufft_ops.py b/tensorflow_nufft/python/ops/nufft_ops.py index cc41a28..5be55b9 100755 --- a/tensorflow_nufft/python/ops/nufft_ops.py +++ b/tensorflow_nufft/python/ops/nufft_ops.py @@ -369,35 +369,35 @@ def _validate_nudft_inputs(source, if transform_type == 'type_1': if not len(grid_shape) == rank: - raise 
ValueError(( - "Invalid `grid_shape` argument: must represent a rank-{} " - "shape. Received: {}").format(rank, grid_shape)) + raise ValueError( + f"Invalid `grid_shape` argument: must represent a rank-{rank} " + f"shape. Received: {grid_shape}") if expected_rank: if not rank == expected_rank: - raise ValueError(( - "Invalid shape for `points` argument: " - "last dimension must be equal to expected rank, which is {}. " - "Received: {}").format(expected_rank, rank)) + raise ValueError( + f"Invalid shape for `points` argument: " + f"last dimension must be equal to expected rank, which is " + f"{expected_rank}. Received: {rank}") # Check that dtype for `source` matches the expected dtype. if expected_dtype: if not source.dtype == expected_dtype: - raise TypeError(( - "Invalid dtype for `source` argument: " - "must match the expected dtype, which is {}. " - "Received: {}").format(expected_dtype, source.dtype)) + raise TypeError( + f"Invalid dtype for `source` argument: " + f"must match the expected dtype, which is {expected_dtype}. " + f"Received: {source.dtype}") expected_dtype = source.dtype # Check that dtype for `points` matches the expected dtype. if not points.dtype == expected_dtype.real_dtype: - raise TypeError(( - "Invalid dtype for `points` argument: " - "must match the real part of the expected dtype, which is {}. " - "Received: {}").format(expected_dtype.real_dtype, points.dtype)) + raise TypeError( + f"Invalid dtype for `points` argument: " + f"must match the real part of the expected dtype, which is " + f"{expected_dtype.real_dtype}. Received: {points.dtype}") # Check that spatial dimensions of input `source` match the expected modes # shape. @@ -405,18 +405,17 @@ def _validate_nudft_inputs(source, if transform_type == 'type_1': if not grid_shape == expected_grid_shape: - raise ValueError(( - "Invalid `grid_shape` argument: " - "expected {}. 
Received: {}").format( - expected_grid_shape, grid_shape)) + raise ValueError( + f"Invalid `grid_shape` argument: " + f"expected {expected_grid_shape}. Received: {grid_shape}") if transform_type == 'type_2': if not source.shape[-rank:] == expected_grid_shape: - raise ValueError(( - "Invalid shape for `source` argument: " - "the modes shape (i.e., dimensions {}) must match " - "the expected modes shape, which is {}. Received: {}").format( - tuple(range(-rank, 0)), expected_grid_shape, source.shape)) + raise ValueError( + f"Invalid shape for `source` argument: " + f"the modes shape (i.e., dimensions {tuple(range(-rank, 0))}) " + f"must match the expected modes shape, which is " + f"{expected_grid_shape}. Received: {source.shape}") # Check that batch shapes for `source` and `points` are broadcastable, and # broadcast them to a common shape. @@ -435,11 +434,10 @@ def _validate_nudft_inputs(source, batch_shape_source, batch_shape_points) except ValueError as err: - raise ValueError(( - "Incompatible batch shapes for `source` and `points`." - "The batch dimensions for `source` and `points` must be " - "broadcastable. Received: {}, {}").format( - source.shape, points.shape)) from err + raise ValueError( + f"Incompatible batch shapes for `source` and `points`. " + f"The batch dimensions for `source` and `points` must be " + f"broadcastable. Received: {source.shape}, {points.shape}") source = tf.broadcast_to(source, batch_shape + source_shape) points = tf.broadcast_to(points, batch_shape + points_shape) @@ -463,9 +461,9 @@ def _validate_enum(value, valid_values, name): ValueError: If a value not in the list of valid values was passed. """ if value not in valid_values: - raise ValueError(( - "The `{}` argument must be one of {}. " - "Received: {}").format(name, valid_values, value)) + raise ValueError( + f"Argument `{name}` must be one of {valid_values}. 
" + f"Received: {value}") return value diff --git a/tools/install_deps/requirements.txt b/tools/install_deps/requirements.txt index f8b2483..c02e57a 100644 --- a/tools/install_deps/requirements.txt +++ b/tools/install_deps/requirements.txt @@ -1 +1 @@ -pylint==2.17.0 +pylint==2.7.4 From b5a8a69b5ccd3308a63e48ddfe685ba4705c21cc Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 19 Mar 2023 08:59:35 +0000 Subject: [PATCH 17/23] Linting --- tensorflow_nufft/python/ops/nufft_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_nufft/python/ops/nufft_ops.py b/tensorflow_nufft/python/ops/nufft_ops.py index 5be55b9..aa2edcf 100755 --- a/tensorflow_nufft/python/ops/nufft_ops.py +++ b/tensorflow_nufft/python/ops/nufft_ops.py @@ -437,7 +437,7 @@ def _validate_nudft_inputs(source, raise ValueError( f"Incompatible batch shapes for `source` and `points`. " f"The batch dimensions for `source` and `points` must be " - f"broadcastable. Received: {source.shape}, {points.shape}") + f"broadcastable. 
Received: {source.shape}, {points.shape}") from err source = tf.broadcast_to(source, batch_shape + source_shape) points = tf.broadcast_to(points, batch_shape + points_shape) From 24039de38b3ecd84bff5c2acdd56f0d64ff8361e Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 19 Mar 2023 09:05:45 +0000 Subject: [PATCH 18/23] Remove patch number from TF version --- .github/workflows/build-and-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 4df59ee..2ffa5e7 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -38,7 +38,7 @@ jobs: # https://github.com/bazelbuild/bazel/issues/14232#issuecomment-1011247429 os: ['ubuntu-20.04'] # ['macos-12', 'windows-2019', 'ubuntu-20.04'] py-version: ['3.8'] # ['3.7', '3.8', '3.9', '3.10'] - tf-version: ['2.11.0'] + tf-version: ['2.11'] cpu: ['x86'] # include: # - os: 'macos-12' From 9c759ea6fb15970fe78b60c632eddba60e1c9093 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 19 Mar 2023 09:17:08 +0000 Subject: [PATCH 19/23] Enable Windows in CI --- .github/workflows/build-and-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 2ffa5e7..cb16b77 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -36,7 +36,7 @@ jobs: matrix: # To switch on windows-2022/latest, please verify the bazel version: # https://github.com/bazelbuild/bazel/issues/14232#issuecomment-1011247429 - os: ['ubuntu-20.04'] # ['macos-12', 'windows-2019', 'ubuntu-20.04'] + os: ['ubuntu-20.04', 'windows-2019'] # ['macos-12', 'windows-2019', 'ubuntu-20.04'] py-version: ['3.8'] # ['3.7', '3.8', '3.9', '3.10'] tf-version: ['2.11'] cpu: ['x86'] From 9279084dc3e9e6e757e48712787108ce01ed69e5 Mon Sep 17 00:00:00 2001 From: Javier 
Montalt Tordera Date: Sun, 19 Mar 2023 09:56:37 +0000 Subject: [PATCH 20/23] Parallelize CI workflows --- .github/workflows/build-and-release.yml | 126 ++++++++++++++++-- Makefile | 6 + tools/build/make_wheel_macOS_x86.sh | 1 - tools/install_deps/docs.txt | 7 + .../{requirements.txt => lint.txt} | 0 tools/install_deps/pytest.txt | 7 - 6 files changed, 126 insertions(+), 21 deletions(-) create mode 100644 tools/install_deps/docs.txt rename tools/install_deps/{requirements.txt => lint.txt} (100%) delete mode 100644 tools/install_deps/pytest.txt diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index cb16b77..532e52b 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -30,7 +30,7 @@ jobs: # python configure.py # bazel test -k --test_timeout 300,450,1200,3600 --test_output=errors //tensorflow_addons/... build: - name: Test and build wheels + name: Build runs-on: ${{ matrix.os }} strategy: matrix: @@ -81,14 +81,6 @@ jobs: # Ubuntu bazel is run inside of the docker image run: bash tools/install_deps/install_bazelisk.sh ./ - - name: Install requirements - run: | - pip install -r requirements.txt - pip install -r tools/install_deps/requirements.txt - - - name: Lint code - run: make lint - - name: Build wheels env: OS: ${{ runner.os }} @@ -98,14 +90,122 @@ jobs: shell: bash run: bash ./tools/build/make_wheel_${OS}_${CPU}.sh + - name: Upload wheels + if: startsWith(github.ref, 'refs/tags') + uses: actions/upload-artifact@v2 + with: + name: ${{ runner.os }}-${{ matrix.py-version }}-tf${{ matrix.tf-version }}-${{ matrix.cpu }}-wheel + path: ./wheelhouse + + - name: Publish to Test PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + repository_url: https://test.pypi.org/legacy/ + packages_dir: wheelhouse/ + skip_existing: true + + lint: + name: Lint + runs-on: ubuntu-20.04 + steps: + - name: Checkout code + uses: 
actions/checkout@v2 + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + + - name: Install requirements + run: | + pip install -r tools/install_deps/lint.txt + + - name: Lint code + if: matrix.os == ubuntu-20.04 && matrix.py-version == 3.8 + run: make lint + + docs: + name: Documentation + runs-on: ubuntu-20.04 + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + + - name: Install requirements + run: | + pip install -r tools/install_deps/docs.txt + - name: Build docs + run: make docs + + - name: Upload docs + if: startsWith(github.ref, 'refs/tags') + uses: actions/upload-artifact@v2 + with: + name: docs + path: ./docs/_build/dirhtml + + release: + name: Release + needs: [build, lint, docs] + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags') + + steps: + - name: Checkout docs branch + uses: actions/checkout@v2 + with: + ref: docs + + - name: Clean docs branch run: | - make docs PY_VERSION=${{ matrix.py_version }} + rm -rf ./* - - uses: actions/upload-artifact@v1 + - name: Download docs + uses: actions/download-artifact@v2 with: - name: ${{ runner.os }}-${{ matrix.py-version }}-tf${{ matrix.tf-version }}-${{ matrix.cpu }}-wheel - path: wheelhouse + name: docs + + - name: Publish docs to GitHub Pages + run: | + git config --global user.name mrphys + git config --global user.email mrphys@users.noreply.github.com + git add -A + git commit -m "Update docs [bot]" || echo "No changes to commit." 
+ git push --force + + - name: Checkout master + uses: actions/checkout@v2 + + - name: Get release tag + run: | + echo "release=${GITHUB_REF/refs\/tags\/v/}" >> $GITHUB_ENV + + - name: Create a release + uses: softprops/action-gh-release@v1 + with: + name: TensorFlow NUFFT ${{ env.release }} + body_path: RELEASE.md + prerelease: ${{ contains(env.release, 'a') || contains(env.release, 'b') || contains(env.release, 'rc') }} + fail_on_unmatched_files: true + + - name: Download wheels + uses: actions/download-artifact@v2 + with: + name: wheels + path: artifacts/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + packages_dir: artifacts/ # upload-wheels: # name: Publish wheels to PyPi diff --git a/Makefile b/Makefile index dd59f31..c4ce55e 100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1,8 @@ +.PHONY: lint lint: pylint --rcfile=pylintrc tensorflow_nufft/python + +.PHONY: docs +docs: + rm -rf docs/_* docs/api_docs/tfft/ + $(MAKE) -C docs dirhtml diff --git a/tools/build/make_wheel_macOS_x86.sh b/tools/build/make_wheel_macOS_x86.sh index 908a671..447007a 100644 --- a/tools/build/make_wheel_macOS_x86.sh +++ b/tools/build/make_wheel_macOS_x86.sh @@ -29,4 +29,3 @@ bazel-bin/build_pip_pkg artifacts $NIGHTLY_FLAG # Setting DYLD_LIBRARY_PATH to help delocate finding tensorflow after the rpath invalidation export DYLD_LIBRARY_PATH=$DYLD_LIBRARY_PATH:$(python -c 'import configure; print(configure.get_tf_shared_lib_dir())') delocate-wheel -w wheelhouse -v artifacts/*.whl - diff --git a/tools/install_deps/docs.txt b/tools/install_deps/docs.txt new file mode 100644 index 0000000..4cebcb1 --- /dev/null +++ b/tools/install_deps/docs.txt @@ -0,0 +1,7 @@ +sphinx==4.5.0 +pydata-sphinx-theme==0.8.0 +ipython +sphinx-sitemap==2.5.0 +myst-nb==0.17.1 +sphinx-book-theme==0.3.3 +pydot==1.4.2 diff --git a/tools/install_deps/requirements.txt b/tools/install_deps/lint.txt similarity index 100% rename from 
tools/install_deps/requirements.txt rename to tools/install_deps/lint.txt diff --git a/tools/install_deps/pytest.txt b/tools/install_deps/pytest.txt deleted file mode 100644 index d49227a..0000000 --- a/tools/install_deps/pytest.txt +++ /dev/null @@ -1,7 +0,0 @@ -pytest~=6.2.5 -pytest-xdist~=1.31 -pytest-extra-durations~=0.1.3 -scikit-learn~=1.0.2 -scikit-image~=0.19.2 -Pillow~=9.0.1 -tqdm>=4.36.1 From 99e2600654393368e475311a8f3121e906f38e01 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 19 Mar 2023 10:00:13 +0000 Subject: [PATCH 21/23] Fix error in CI workflow --- .github/workflows/build-and-release.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 532e52b..9a9267a 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -122,7 +122,6 @@ jobs: pip install -r tools/install_deps/lint.txt - name: Lint code - if: matrix.os == ubuntu-20.04 && matrix.py-version == 3.8 run: make lint docs: From 2576bb0c222768153c274d7e7261157946c4c457 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 19 Mar 2023 10:00:44 +0000 Subject: [PATCH 22/23] Rename docs step in workflow --- .github/workflows/build-and-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml index 9a9267a..d166a43 100644 --- a/.github/workflows/build-and-release.yml +++ b/.github/workflows/build-and-release.yml @@ -125,7 +125,7 @@ jobs: run: make lint docs: - name: Documentation + name: Docs runs-on: ubuntu-20.04 steps: - name: Checkout code From 6ea609d4dbee3109a41b97713770cc367bbf80e6 Mon Sep 17 00:00:00 2001 From: Javier Montalt Tordera Date: Sun, 19 Mar 2023 10:07:32 +0000 Subject: [PATCH 23/23] Add test target to new Makefile --- Makefile | 3 +++ docs/requirements.txt | 0 tools/install_deps/tensorflow-cpu.txt | 1 - 
tools/install_deps/tensorflow.txt | 1 - 4 files changed, 3 insertions(+), 2 deletions(-) delete mode 100644 docs/requirements.txt delete mode 100644 tools/install_deps/tensorflow-cpu.txt delete mode 100644 tools/install_deps/tensorflow.txt diff --git a/Makefile b/Makefile index c4ce55e..bf76c99 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,7 @@ +.PHONY: test +test: + python -m unittest discover -v -p *_test.py + .PHONY: lint lint: pylint --rcfile=pylintrc tensorflow_nufft/python diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index e69de29..0000000 diff --git a/tools/install_deps/tensorflow-cpu.txt b/tools/install_deps/tensorflow-cpu.txt deleted file mode 100644 index a56d24a..0000000 --- a/tools/install_deps/tensorflow-cpu.txt +++ /dev/null @@ -1 +0,0 @@ -tensorflow-cpu~=2.11.0 diff --git a/tools/install_deps/tensorflow.txt b/tools/install_deps/tensorflow.txt deleted file mode 100644 index 1f4b31a..0000000 --- a/tools/install_deps/tensorflow.txt +++ /dev/null @@ -1 +0,0 @@ -tensorflow~=2.11.0 \ No newline at end of file