From 3c10e5b2f6ad03655feae3514870cfc6b2ed2665 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough <ethanluismcdonough@gmail.com> Date: Wed, 20 Dec 2023 16:58:56 -0600 Subject: [PATCH 001/342] [OpenMP] Add unit tests for nextgen plugins (#74398) This patch add three GTest unit tests that test plugin read and write operations. Tests can be compiled with `ninja -C runtimes/runtimes-bins LibomptUnitTests`. --- openmp/libomptarget/CMakeLists.txt | 6 + openmp/libomptarget/unittests/CMakeLists.txt | 8 + .../unittests/Plugins/CMakeLists.txt | 11 ++ .../unittests/Plugins/NextgenPluginsTest.cpp | 168 ++++++++++++++++++ 4 files changed, 193 insertions(+) create mode 100644 openmp/libomptarget/unittests/CMakeLists.txt create mode 100644 openmp/libomptarget/unittests/Plugins/CMakeLists.txt create mode 100644 openmp/libomptarget/unittests/Plugins/NextgenPluginsTest.cpp diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt index 66925ccbe0305..7060e20af0932 100644 --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -150,3 +150,9 @@ add_subdirectory(${LIBOMPTARGET_SRC_DIR}) # Add tests. add_subdirectory(test) + +# Add unit tests if GMock/GTest is present +if (EXISTS ${LLVM_THIRD_PARTY_DIR}/unittest) + add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest) + add_subdirectory(unittests) +endif() diff --git a/openmp/libomptarget/unittests/CMakeLists.txt b/openmp/libomptarget/unittests/CMakeLists.txt new file mode 100644 index 0000000000000..73c87b708d25f --- /dev/null +++ b/openmp/libomptarget/unittests/CMakeLists.txt @@ -0,0 +1,8 @@ +add_custom_target(LibomptUnitTests) +set_target_properties(LibomptUnitTests PROPERTIES FOLDER "Tests/UnitTests") + +function(add_libompt_unittest test_dirname) + add_unittest(LibomptUnitTests ${test_dirname} ${ARGN}) +endfunction() + +add_subdirectory(Plugins) diff --git a/openmp/libomptarget/unittests/Plugins/CMakeLists.txt b/openmp/libomptarget/unittests/Plugins/CMakeLists.txt new file mode 100644 index 0000000000000..e137d2a9d1774 --- /dev/null +++ b/openmp/libomptarget/unittests/Plugins/CMakeLists.txt @@ -0,0 +1,11 @@ +set(PLUGINS_TEST_COMMON omptarget OMPT omptarget.devicertl) +set(PLUGINS_TEST_SOURCES NextgenPluginsTest.cpp) +set(PLUGINS_TEST_INCLUDE ${LIBOMPTARGET_INCLUDE_DIR}) + +foreach(PLUGIN IN LISTS LIBOMPTARGET_TESTED_PLUGINS) + libomptarget_say("Building plugin unit tests for ${PLUGIN}") + add_libompt_unittest("${PLUGIN}.unittests" ${PLUGINS_TEST_SOURCES}) + add_dependencies("${PLUGIN}.unittests" ${PLUGINS_TEST_COMMON} ${PLUGIN}) + target_link_libraries("${PLUGIN}.unittests" PRIVATE ${PLUGINS_TEST_COMMON} ${PLUGIN}) + target_include_directories("${PLUGIN}.unittests" PRIVATE ${PLUGINS_TEST_INCLUDE}) +endforeach() diff --git a/openmp/libomptarget/unittests/Plugins/NextgenPluginsTest.cpp b/openmp/libomptarget/unittests/Plugins/NextgenPluginsTest.cpp new file mode 100644 index 0000000000000..635bd1637c903 --- /dev/null +++ b/openmp/libomptarget/unittests/Plugins/NextgenPluginsTest.cpp @@ -0,0 +1,168 @@ +//===------- unittests/Plugins/NextgenPluginsTest.cpp - Plugin tests ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Shared/PluginAPI.h" +#include "omptarget.h" +#include "gtest/gtest.h" + +#include <unordered_set> + +const int DEVICE_ID = 0; +std::unordered_set<int> setup_map; + +int init_test_device(int ID) { + if (setup_map.find(ID) != setup_map.end()) { + return OFFLOAD_SUCCESS; + } + if (__tgt_rtl_init_plugin() == OFFLOAD_FAIL || + __tgt_rtl_init_device(ID) == OFFLOAD_FAIL) { + return OFFLOAD_FAIL; + } + setup_map.insert(ID); + return OFFLOAD_SUCCESS; +} + +// Test plugin initialization +TEST(NextgenPluginsTest, PluginInit) { + EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_ID)); +} + +// Test GPU allocation and R/W +TEST(NextgenPluginsTest, PluginAlloc) { + int32_t test_value = 23; + int32_t host_value = -1; + int64_t var_size = sizeof(int32_t); + + // Init plugin and device + EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_ID)); + + // Allocate memory + void *device_ptr = + __tgt_rtl_data_alloc(DEVICE_ID, var_size, nullptr, TARGET_ALLOC_DEFAULT); + + // Check that the result is not null + EXPECT_NE(device_ptr, nullptr); + + // Submit data to device + EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_submit(DEVICE_ID, device_ptr, + &test_value, var_size)); + + // Read data from device + EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_retrieve(DEVICE_ID, &host_value, + device_ptr, var_size)); + + // Compare values + EXPECT_EQ(host_value, test_value); + + // Cleanup data + EXPECT_EQ(OFFLOAD_SUCCESS, + __tgt_rtl_data_delete(DEVICE_ID, device_ptr, TARGET_ALLOC_DEFAULT)); +} + +// Test async GPU allocation and R/W +TEST(NextgenPluginsTest, PluginAsyncAlloc) { + int32_t test_value = 47; + int32_t host_value = -1; + int64_t var_size = sizeof(int32_t); + __tgt_async_info *info; + + // Init plugin and device + EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_ID)); + + // Check if device supports async + // Platforms like x86_64 don't support it + if (__tgt_rtl_init_async_info(DEVICE_ID, &info) == OFFLOAD_SUCCESS) { + // Allocate memory + void *device_ptr = __tgt_rtl_data_alloc(DEVICE_ID, var_size, nullptr, + TARGET_ALLOC_DEFAULT); + + // Check that the result is not null + EXPECT_NE(device_ptr, nullptr); + + // Submit data to device asynchronously + EXPECT_EQ(OFFLOAD_SUCCESS, + __tgt_rtl_data_submit_async(DEVICE_ID, device_ptr, &test_value, + var_size, info)); + + // Wait for async request to process + EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_synchronize(DEVICE_ID, info)); + + // Read data from device + EXPECT_EQ(OFFLOAD_SUCCESS, + __tgt_rtl_data_retrieve_async(DEVICE_ID, &host_value, device_ptr, + var_size, info)); + + // Wait for async request to process + EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_synchronize(DEVICE_ID, info)); + + // Compare values + EXPECT_EQ(host_value, test_value); + + // Cleanup data + EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_delete(DEVICE_ID, device_ptr, + TARGET_ALLOC_DEFAULT)); + } +} + +// Test GPU data exchange +TEST(NextgenPluginsTest, PluginDataSwap) { + int32_t test_value = 23; + int32_t host_value = -1; + int64_t var_size = sizeof(int32_t); + + // Look for compatible device + int DEVICE_TWO = -1; + for (int i = 1; i < __tgt_rtl_number_of_devices(); i++) { + if (__tgt_rtl_is_data_exchangable(DEVICE_ID, i)) { + DEVICE_TWO = i; + break; + } + } + + // Only run test if we have multiple GPUs to test + // GPUs must be compatible for test to work + if (DEVICE_TWO >= 1) { + // Init both GPUs + EXPECT_EQ(OFFLOAD_SUCCESS, 
init_test_device(DEVICE_ID)); + EXPECT_EQ(OFFLOAD_SUCCESS, init_test_device(DEVICE_TWO)); + + // Allocate memory on both GPUs + // DEVICE_ID will be the source + // DEVICE_TWO will be the destination + void *source_ptr = __tgt_rtl_data_alloc(DEVICE_ID, var_size, nullptr, + TARGET_ALLOC_DEFAULT); + void *dest_ptr = __tgt_rtl_data_alloc(DEVICE_TWO, var_size, nullptr, + TARGET_ALLOC_DEFAULT); + + // Check for success in allocation + EXPECT_NE(source_ptr, nullptr); + EXPECT_NE(dest_ptr, nullptr); + + // Write data to source + EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_submit(DEVICE_ID, source_ptr, + &test_value, var_size)); + + // Transfer data between devices + EXPECT_EQ(OFFLOAD_SUCCESS, + __tgt_rtl_data_exchange(DEVICE_ID, source_ptr, DEVICE_TWO, + dest_ptr, var_size)); + + // Read from destination device (DEVICE_TWO) memory + EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_retrieve(DEVICE_TWO, &host_value, + dest_ptr, var_size)); + + // Ensure match + EXPECT_EQ(host_value, test_value); + + // Cleanup + EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_delete(DEVICE_ID, source_ptr, + TARGET_ALLOC_DEFAULT)); + EXPECT_EQ(OFFLOAD_SUCCESS, __tgt_rtl_data_delete(DEVICE_TWO, dest_ptr, + TARGET_ALLOC_DEFAULT)); + } +} From 39f09ec245f3906bbc6b06500f7177a8001e7eed Mon Sep 17 00:00:00 2001 From: "Ivan R. Ivanov" <ivanov.i.aa@m.titech.ac.jp> Date: Thu, 21 Dec 2023 08:01:21 +0900 Subject: [PATCH 002/342] Invalidate analyses after running Attributor in OpenMPOpt (#74908) Using the LoopInfo from OMPInfoCache after the Attributor ran resulted in a crash due to it being in an invalid state. --------- Co-authored-by: Ivan Radanov Ivanov <ivanov2@llnl.gov> --- llvm/include/llvm/Transforms/IPO/Attributor.h | 10 ++++++++++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 3 +++ 2 files changed, 13 insertions(+) diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 50167708163ef..30c51250af61c 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -1157,6 +1157,12 @@ struct AnalysisGetter { return nullptr; } + /// Invalidates the analyses. Valid only when using the new pass manager. + void invalidateAnalyses() { + assert(FAM && "Can only be used from the new PM!"); + FAM->clear(); + } + AnalysisGetter(FunctionAnalysisManager &FAM, bool CachedOnly = false) : FAM(&FAM), CachedOnly(CachedOnly) {} AnalysisGetter(Pass *P, bool CachedOnly = false) @@ -1286,6 +1292,10 @@ struct InformationCache { return AssumeOnlyValues.contains(&I); } + /// Invalidates the cached analyses. Valid only when using the new pass + /// manager. + void invalidateAnalyses() { AG.invalidateAnalyses(); } + /// Return the analysis result from a pass \p AP for function \p F. 
template <typename AP> typename AP::Result *getAnalysisResultForFunction(const Function &F, diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 2c880316e0a1c..4176d561363fb 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -2053,6 +2053,9 @@ struct OpenMPOpt { LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size() << " functions, result: " << Changed << ".\n"); + if (Changed == ChangeStatus::CHANGED) + OMPInfoCache.invalidateAnalyses(); + return Changed == ChangeStatus::CHANGED; } From 7e4c6f6cb2e17ee186c9525e59218de0b2277799 Mon Sep 17 00:00:00 2001 From: Shilei Tian <i@tianshilei.me> Date: Wed, 20 Dec 2023 18:03:01 -0500 Subject: [PATCH 003/342] [OpenMP] Reduce the size of heap memory required by the test `malloc_parallel.c` (#75885) This patch reduces the size of heap memory required by the test `malloc_parallel.c` and `malloc.c`. The original size is too large such that `malloc` returns `nullptr` on many threads, causing illegal memory access. --- openmp/libomptarget/test/offloading/malloc.c | 8 ++++---- openmp/libomptarget/test/offloading/malloc_parallel.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/openmp/libomptarget/test/offloading/malloc.c b/openmp/libomptarget/test/offloading/malloc.c index ad49ace200625..7b98e1f1110e5 100644 --- a/openmp/libomptarget/test/offloading/malloc.c +++ b/openmp/libomptarget/test/offloading/malloc.c @@ -6,9 +6,9 @@ int main() { long unsigned *DP = 0; - int N = 128; - int Threads = 128; - int Teams = 440; + int N = 32; + int Threads = 64; + int Teams = 10; // Allocate ~55MB on the device. #pragma omp target map(from : DP) @@ -31,7 +31,7 @@ int main() { } } - // CHECK: Sum: 203458478080 + // CHECK: Sum: 6860800 printf("Sum: %li\n", s); return 0; } diff --git a/openmp/libomptarget/test/offloading/malloc_parallel.c b/openmp/libomptarget/test/offloading/malloc_parallel.c index 4908e00694d99..076a7ba397a3c 100644 --- a/openmp/libomptarget/test/offloading/malloc_parallel.c +++ b/openmp/libomptarget/test/offloading/malloc_parallel.c @@ -7,9 +7,9 @@ int main() { long unsigned **DP = 0; - int N = 128; - int Threads = 128; - int Teams = 440; + int N = 32; + int Threads = 64; + int Teams = 10; #pragma omp target map(from : DP) DP = (long unsigned **)malloc(sizeof(long unsigned *) * Threads * Teams); @@ -36,7 +36,7 @@ int main() { } } - // CHECK: Sum: 203458478080 + // CHECK: Sum: 6860800 printf("Sum: %li\n", s); return 0; } From f94adfd50cc28d5108624e401b263382b259cf84 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson <paulson1@linux.ibm.com> Date: Thu, 21 Dec 2023 00:08:41 +0100 Subject: [PATCH 004/342] [docs] Reword the alignment implications for atomic instructions. (#75871) Atomic instructions (load / store/ atomicrwm / cmpxchg) are not really undefined behavior if they lack natural alignment. They will (with AtomicExpand pass enabled) be converted into libcalls. Update the language reference to reflect this. --- llvm/docs/LangRef.rst | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 7f4a316a21ace..b5918e3063d86 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -10515,9 +10515,10 @@ Atomic loads produce :ref:`defined <memmodel>` results when they may see multiple atomic stores. 
The type of the pointee must be an integer, pointer, or floating-point type whose bit width is a power of two greater than or equal to eight and less than or equal to a target-specific size limit. ``align`` must be -explicitly specified on atomic loads, and the load has undefined behavior if the -alignment is not set to a value which is at least the size in bytes of the -pointee. ``!nontemporal`` does not have any defined semantics for atomic loads. +explicitly specified on atomic loads. Note: if the alignment is not greater or +equal to the size of the `<value>` type, the atomic operation is likely to +require a lock and have poor performance. ``!nontemporal`` does not have any +defined semantics for atomic loads. The optional constant ``align`` argument specifies the alignment of the operation (that is, the alignment of the memory address). It is the @@ -10655,9 +10656,10 @@ Atomic loads produce :ref:`defined <memmodel>` results when they may see multiple atomic stores. The type of the pointee must be an integer, pointer, or floating-point type whose bit width is a power of two greater than or equal to eight and less than or equal to a target-specific size limit. ``align`` must be -explicitly specified on atomic stores, and the store has undefined behavior if -the alignment is not set to a value which is at least the size in bytes of the -pointee. ``!nontemporal`` does not have any defined semantics for atomic stores. +explicitly specified on atomic stores. Note: if the alignment is not greater or +equal to the size of the `<value>` type, the atomic operation is likely to +require a lock and have poor performance. ``!nontemporal`` does not have any +defined semantics for atomic stores. The optional constant ``align`` argument specifies the alignment of the operation (that is, the alignment of the memory address). It is the @@ -10807,8 +10809,9 @@ must be at least ``monotonic``, the failure ordering cannot be either A ``cmpxchg`` instruction can also take an optional ":ref:`syncscope <syncscope>`" argument. -The alignment must be a power of two greater or equal to the size of the -`<value>` type. +Note: if the alignment is not greater or equal to the size of the `<value>` +type, the atomic operation is likely to require a lock and have poor +performance. The alignment is only optional when parsing textual IR; for in-memory IR, it is always present. If unspecified, the alignment is assumed to be equal to the @@ -10910,8 +10913,9 @@ the ``atomicrmw`` is marked as ``volatile``, then the optimizer is not allowed to modify the number or order of execution of this ``atomicrmw`` with other :ref:`volatile operations <volatile>`. -The alignment must be a power of two greater or equal to the size of the -`<value>` type. +Note: if the alignment is not greater or equal to the size of the `<value>` +type, the atomic operation is likely to require a lock and have poor +performance. The alignment is only optional when parsing textual IR; for in-memory IR, it is always present. If unspecified, the alignment is assumed to be equal to the From b37c0486b2e82832ba353214b58f26924d8ca68e Mon Sep 17 00:00:00 2001 From: michaelrj-google <71531609+michaelrj-google@users.noreply.github.com> Date: Wed, 20 Dec 2023 15:12:54 -0800 Subject: [PATCH 005/342] [libc][NFC] clean up printf_core and scanf_core (#74535) Add LIBC_INLINE annotations to functions and fix variable cases within printf_core and scanf_core. 
--- libc/src/stdio/printf_core/char_converter.h | 6 ++-- libc/src/stdio/printf_core/converter_utils.h | 1 - libc/src/stdio/printf_core/core_structs.h | 12 +++---- .../stdio/printf_core/float_dec_converter.h | 35 +++++++++---------- .../stdio/printf_core/float_hex_converter.h | 2 -- .../printf_core/float_inf_nan_converter.h | 1 - libc/src/stdio/printf_core/int_converter.h | 1 - libc/src/stdio/printf_core/parser.h | 3 -- libc/src/stdio/printf_core/ptr_converter.h | 3 -- libc/src/stdio/printf_core/string_converter.h | 1 - .../stdio/printf_core/write_int_converter.h | 1 - libc/src/stdio/printf_core/writer.cpp | 2 -- libc/src/stdio/printf_core/writer.h | 2 +- libc/src/stdio/scanf_core/converter_utils.h | 2 -- libc/src/stdio/scanf_core/core_structs.h | 3 +- .../stdio/scanf_core/current_pos_converter.h | 1 - libc/src/stdio/scanf_core/parser.h | 1 - libc/src/stdio/scanf_core/reader.h | 9 ++--- 18 files changed, 31 insertions(+), 55 deletions(-) diff --git a/libc/src/stdio/printf_core/char_converter.h b/libc/src/stdio/printf_core/char_converter.h index 9b1501ff24b0d..13596b8ed4f23 100644 --- a/libc/src/stdio/printf_core/char_converter.h +++ b/libc/src/stdio/printf_core/char_converter.h @@ -9,8 +9,6 @@ #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_CHAR_CONVERTER_H #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_CHAR_CONVERTER_H -#include "src/__support/CPP/string_view.h" -#include "src/__support/common.h" #include "src/stdio/printf_core/converter_utils.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/writer.h" @@ -21,10 +19,10 @@ namespace printf_core { LIBC_INLINE int convert_char(Writer *writer, const FormatSection &to_conv) { char c = static_cast<char>(to_conv.conv_val_raw); - constexpr int string_len = 1; + constexpr int STRING_LEN = 1; size_t padding_spaces = - to_conv.min_width > string_len ? to_conv.min_width - string_len : 0; + to_conv.min_width > STRING_LEN ? to_conv.min_width - STRING_LEN : 0; // If the padding is on the left side, write the spaces first. if (padding_spaces > 0 && diff --git a/libc/src/stdio/printf_core/converter_utils.h b/libc/src/stdio/printf_core/converter_utils.h index 4540bba6346e2..54f0a870d0ac4 100644 --- a/libc/src/stdio/printf_core/converter_utils.h +++ b/libc/src/stdio/printf_core/converter_utils.h @@ -10,7 +10,6 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_CONVERTER_UTILS_H #include "src/__support/CPP/limits.h" -#include "src/__support/common.h" #include "src/stdio/printf_core/core_structs.h" #include <inttypes.h> diff --git a/libc/src/stdio/printf_core/core_structs.h b/libc/src/stdio/printf_core/core_structs.h index 37538362fa3e7..7634d45568ab8 100644 --- a/libc/src/stdio/printf_core/core_structs.h +++ b/libc/src/stdio/printf_core/core_structs.h @@ -53,7 +53,7 @@ struct FormatSection { // This operator is only used for testing and should be automatically // optimized out for release builds. - bool operator==(const FormatSection &other) const { + LIBC_INLINE bool operator==(const FormatSection &other) const { if (has_conv != other.has_conv) return false; @@ -93,11 +93,11 @@ template <typename T> LIBC_INLINE constexpr TypeDesc type_desc_from_type() { if constexpr (cpp::is_same_v<T, void>) { return TypeDesc{0, PrimaryType::Unknown}; } else { - constexpr bool isPointer = cpp::is_pointer_v<T>; - constexpr bool isFloat = cpp::is_floating_point_v<T>; - return TypeDesc{sizeof(T), isPointer ? PrimaryType::Pointer - : isFloat ? 
PrimaryType::Float - : PrimaryType::Integer}; + constexpr bool IS_POINTER = cpp::is_pointer_v<T>; + constexpr bool IS_FLOAT = cpp::is_floating_point_v<T>; + return TypeDesc{sizeof(T), IS_POINTER ? PrimaryType::Pointer + : IS_FLOAT ? PrimaryType::Float + : PrimaryType::Integer}; } } diff --git a/libc/src/stdio/printf_core/float_dec_converter.h b/libc/src/stdio/printf_core/float_dec_converter.h index 458f494d5edfd..798bb955cca14 100644 --- a/libc/src/stdio/printf_core/float_dec_converter.h +++ b/libc/src/stdio/printf_core/float_dec_converter.h @@ -10,13 +10,9 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FLOAT_DEC_CONVERTER_H #include "src/__support/CPP/string_view.h" -#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/FloatProperties.h" #include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/UInt.h" -#include "src/__support/UInt128.h" -#include "src/__support/common.h" #include "src/__support/float_to_string.h" #include "src/__support/integer_to_string.h" #include "src/__support/libc_assert.h" @@ -103,14 +99,14 @@ class PaddingWriter { size_t min_width = 0; public: - PaddingWriter() {} - PaddingWriter(const FormatSection &to_conv, char init_sign_char) + LIBC_INLINE PaddingWriter() {} + LIBC_INLINE PaddingWriter(const FormatSection &to_conv, char init_sign_char) : left_justified((to_conv.flags & FormatFlags::LEFT_JUSTIFIED) > 0), leading_zeroes((to_conv.flags & FormatFlags::LEADING_ZEROES) > 0), sign_char(init_sign_char), min_width(to_conv.min_width > 0 ? to_conv.min_width : 0) {} - int write_left_padding(Writer *writer, size_t total_digits) { + LIBC_INLINE int write_left_padding(Writer *writer, size_t total_digits) { // The pattern is (spaces) (sign) (zeroes), but only one of spaces and // zeroes can be written, and only if the padding amount is positive. int padding_amount = @@ -133,7 +129,7 @@ class PaddingWriter { return 0; } - int write_right_padding(Writer *writer, size_t total_digits) { + LIBC_INLINE int write_right_padding(Writer *writer, size_t total_digits) { // If and only if the conversion is left justified, there may be trailing // spaces. int padding_amount = @@ -170,7 +166,7 @@ class FloatWriter { Writer *writer; // Writes to the final output. PaddingWriter padding_writer; // Handles prefixes/padding, uses total_digits. - int flush_buffer(bool round_up_max_blocks = false) { + LIBC_INLINE int flush_buffer(bool round_up_max_blocks = false) { const char MAX_BLOCK_DIGIT = (round_up_max_blocks ? 
'0' : '9'); // Write the most recent buffered block, and mark has_written @@ -249,17 +245,18 @@ class FloatWriter { (sizeof(int) * 8)); public: - FloatWriter(Writer *init_writer, bool init_has_decimal_point, - const PaddingWriter &init_padding_writer) + LIBC_INLINE FloatWriter(Writer *init_writer, bool init_has_decimal_point, + const PaddingWriter &init_padding_writer) : has_decimal_point(init_has_decimal_point), writer(init_writer), padding_writer(init_padding_writer) {} - void init(size_t init_total_digits, size_t init_digits_before_decimal) { + LIBC_INLINE void init(size_t init_total_digits, + size_t init_digits_before_decimal) { total_digits = init_total_digits; digits_before_decimal = init_digits_before_decimal; } - void write_first_block(BlockInt block, bool exp_format = false) { + LIBC_INLINE void write_first_block(BlockInt block, bool exp_format = false) { const DecimalString buf(block); const cpp::string_view int_to_str = buf.view(); size_t digits_buffered = int_to_str.size(); @@ -280,7 +277,7 @@ class FloatWriter { } } - int write_middle_block(BlockInt block) { + LIBC_INLINE int write_middle_block(BlockInt block) { if (block == MAX_BLOCK) { // Buffer max blocks in case of rounding ++max_block_count; } else { // If a non-max block has been found @@ -301,9 +298,9 @@ class FloatWriter { return 0; } - int write_last_block(BlockInt block, size_t block_digits, - RoundDirection round, int exponent = 0, - char exp_char = '\0') { + LIBC_INLINE int write_last_block(BlockInt block, size_t block_digits, + RoundDirection round, int exponent = 0, + char exp_char = '\0') { bool has_exp = (exp_char != '\0'); char end_buff[BLOCK_SIZE]; @@ -458,13 +455,13 @@ class FloatWriter { return WRITE_OK; } - int write_zeroes(uint32_t num_zeroes) { + LIBC_INLINE int write_zeroes(uint32_t num_zeroes) { RET_IF_RESULT_NEGATIVE(flush_buffer()); RET_IF_RESULT_NEGATIVE(writer->write('0', num_zeroes)); return 0; } - int right_pad() { + LIBC_INLINE int right_pad() { return padding_writer.write_right_padding(writer, total_digits); } }; diff --git a/libc/src/stdio/printf_core/float_hex_converter.h b/libc/src/stdio/printf_core/float_hex_converter.h index a3a8c0420beff..5ccae81b430c5 100644 --- a/libc/src/stdio/printf_core/float_hex_converter.h +++ b/libc/src/stdio/printf_core/float_hex_converter.h @@ -10,10 +10,8 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FLOAT_HEX_CONVERTER_H #include "src/__support/CPP/string_view.h" -#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/common.h" #include "src/stdio/printf_core/converter_utils.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/float_inf_nan_converter.h" diff --git a/libc/src/stdio/printf_core/float_inf_nan_converter.h b/libc/src/stdio/printf_core/float_inf_nan_converter.h index a0310dc88b560..8669dc374cb29 100644 --- a/libc/src/stdio/printf_core/float_inf_nan_converter.h +++ b/libc/src/stdio/printf_core/float_inf_nan_converter.h @@ -10,7 +10,6 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FLOAT_INF_NAN_CONVERTER_H #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/common.h" #include "src/stdio/printf_core/converter_utils.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/writer.h" diff --git a/libc/src/stdio/printf_core/int_converter.h b/libc/src/stdio/printf_core/int_converter.h index 13fcf3f1aa2ed..7744d801cbc18 100644 --- a/libc/src/stdio/printf_core/int_converter.h +++ 
b/libc/src/stdio/printf_core/int_converter.h @@ -11,7 +11,6 @@ #include "src/__support/CPP/span.h" #include "src/__support/CPP/string_view.h" -#include "src/__support/common.h" #include "src/__support/integer_to_string.h" #include "src/stdio/printf_core/converter_utils.h" #include "src/stdio/printf_core/core_structs.h" diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h index f1994517e1ab1..ab491655275fb 100644 --- a/libc/src/stdio/printf_core/parser.h +++ b/libc/src/stdio/printf_core/parser.h @@ -10,9 +10,6 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H #include "src/__support/CPP/optional.h" -#include "src/__support/CPP/type_traits.h" -#include "src/__support/arg_list.h" -#include "src/__support/common.h" #include "src/__support/str_to_integer.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/printf_config.h" diff --git a/libc/src/stdio/printf_core/ptr_converter.h b/libc/src/stdio/printf_core/ptr_converter.h index 73c6e608a59a7..c5d4086647ec3 100644 --- a/libc/src/stdio/printf_core/ptr_converter.h +++ b/libc/src/stdio/printf_core/ptr_converter.h @@ -9,9 +9,6 @@ #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PTR_CONVERTER_H #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PTR_CONVERTER_H -#include "src/__support/CPP/string_view.h" -#include "src/__support/common.h" -#include "src/stdio/printf_core/converter_utils.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/int_converter.h" #include "src/stdio/printf_core/string_converter.h" diff --git a/libc/src/stdio/printf_core/string_converter.h b/libc/src/stdio/printf_core/string_converter.h index 158315311e9ea..04dc5a06da222 100644 --- a/libc/src/stdio/printf_core/string_converter.h +++ b/libc/src/stdio/printf_core/string_converter.h @@ -10,7 +10,6 @@ #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_STRING_CONVERTER_H #include "src/__support/CPP/string_view.h" -#include "src/__support/common.h" #include "src/stdio/printf_core/converter_utils.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/writer.h" diff --git a/libc/src/stdio/printf_core/write_int_converter.h b/libc/src/stdio/printf_core/write_int_converter.h index 35cafacd5a8c1..0310905f36f14 100644 --- a/libc/src/stdio/printf_core/write_int_converter.h +++ b/libc/src/stdio/printf_core/write_int_converter.h @@ -9,7 +9,6 @@ #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_WRITE_INT_CONVERTER_H #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_WRITE_INT_CONVERTER_H -#include "src/__support/CPP/limits.h" #include "src/stdio/printf_core/core_structs.h" #include "src/stdio/printf_core/writer.h" diff --git a/libc/src/stdio/printf_core/writer.cpp b/libc/src/stdio/printf_core/writer.cpp index c831ca14c9d91..f8ecd829af3a6 100644 --- a/libc/src/stdio/printf_core/writer.cpp +++ b/libc/src/stdio/printf_core/writer.cpp @@ -8,9 +8,7 @@ #include "writer.h" #include "src/__support/CPP/string_view.h" -#include "src/__support/macros/optimization.h" #include "src/stdio/printf_core/core_structs.h" -#include "src/string/memory_utils/inline_memcpy.h" #include "src/string/memory_utils/inline_memset.h" #include <stddef.h> diff --git a/libc/src/stdio/printf_core/writer.h b/libc/src/stdio/printf_core/writer.h index e4f503abc34c5..67513eca97288 100644 --- a/libc/src/stdio/printf_core/writer.h +++ b/libc/src/stdio/printf_core/writer.h @@ -45,7 +45,7 @@ struct WriteBuffer { // write as much of new_str to the buffer as it can. The current position in // the buffer will be reset iff stream_writer is called. 
Calling this with an // empty string will flush the buffer if relevant. - int overflow_write(cpp::string_view new_str) { + LIBC_INLINE int overflow_write(cpp::string_view new_str) { // If there is a stream_writer, write the contents of the buffer, then // new_str, then clear the buffer. if (stream_writer != nullptr) { diff --git a/libc/src/stdio/scanf_core/converter_utils.h b/libc/src/stdio/scanf_core/converter_utils.h index a14f35796d27f..a25e8a73e99a4 100644 --- a/libc/src/stdio/scanf_core/converter_utils.h +++ b/libc/src/stdio/scanf_core/converter_utils.h @@ -9,11 +9,9 @@ #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_CONVERTER_UTILS_H #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_CONVERTER_UTILS_H -#include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/str_to_float.h" #include "src/stdio/scanf_core/core_structs.h" -#include "src/stdio/scanf_core/reader.h" #include <stddef.h> diff --git a/libc/src/stdio/scanf_core/core_structs.h b/libc/src/stdio/scanf_core/core_structs.h index 246f770e0cabe..29e1bf2e47f39 100644 --- a/libc/src/stdio/scanf_core/core_structs.h +++ b/libc/src/stdio/scanf_core/core_structs.h @@ -11,7 +11,6 @@ #include "src/__support/CPP/bitset.h" #include "src/__support/CPP/string_view.h" -#include "src/__support/FPUtil/FPBits.h" #include <inttypes.h> #include <stddef.h> @@ -46,7 +45,7 @@ struct FormatSection { cpp::bitset<256> scan_set; - bool operator==(const FormatSection &other) { + LIBC_INLINE bool operator==(const FormatSection &other) { if (has_conv != other.has_conv) return false; diff --git a/libc/src/stdio/scanf_core/current_pos_converter.h b/libc/src/stdio/scanf_core/current_pos_converter.h index fd62383e39409..be25cefed151a 100644 --- a/libc/src/stdio/scanf_core/current_pos_converter.h +++ b/libc/src/stdio/scanf_core/current_pos_converter.h @@ -9,7 +9,6 @@ #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_CURRENT_POS_CONVERTER_H #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_CURRENT_POS_CONVERTER_H -#include "src/__support/common.h" #include "src/stdio/scanf_core/converter_utils.h" #include "src/stdio/scanf_core/core_structs.h" #include "src/stdio/scanf_core/reader.h" diff --git a/libc/src/stdio/scanf_core/parser.h b/libc/src/stdio/scanf_core/parser.h index 7f3a53be35700..5ae9009bc4a23 100644 --- a/libc/src/stdio/scanf_core/parser.h +++ b/libc/src/stdio/scanf_core/parser.h @@ -10,7 +10,6 @@ #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_PARSER_H #include "src/__support/arg_list.h" -#include "src/__support/common.h" #include "src/__support/ctype_utils.h" #include "src/__support/str_to_integer.h" #include "src/stdio/scanf_core/core_structs.h" diff --git a/libc/src/stdio/scanf_core/reader.h b/libc/src/stdio/scanf_core/reader.h index d8647fe2c4ec7..f750c4341a8d7 100644 --- a/libc/src/stdio/scanf_core/reader.h +++ b/libc/src/stdio/scanf_core/reader.h @@ -38,10 +38,11 @@ class Reader { public: // TODO: Set buff_len with a proper constant - Reader(ReadBuffer *string_buffer) : rb(string_buffer) {} + LIBC_INLINE Reader(ReadBuffer *string_buffer) : rb(string_buffer) {} - Reader(void *stream, StreamGetc stream_getc_in, StreamUngetc stream_ungetc_in, - ReadBuffer *stream_buffer = nullptr) + LIBC_INLINE Reader(void *stream, StreamGetc stream_getc_in, + StreamUngetc stream_ungetc_in, + ReadBuffer *stream_buffer = nullptr) : rb(stream_buffer), input_stream(stream), stream_getc(stream_getc_in), stream_ungetc(stream_ungetc_in) {} @@ -63,7 +64,7 @@ class Reader { // this is a file reader, else c is ignored. 
void ungetc(char c); - size_t chars_read() { return cur_chars_read; } + LIBC_INLINE size_t chars_read() { return cur_chars_read; } }; } // namespace scanf_core From f324584ae3b6330be3d79e89cdacd3969760574e Mon Sep 17 00:00:00 2001 From: Joseph Huber <huberjn@outlook.com> Date: Wed, 20 Dec 2023 17:13:41 -0600 Subject: [PATCH 006/342] [Libomptarget][NFCI] Remove caching of created ELF files (#76080) Summary: We currently keep a cache of created ELF files from the relevant images. This shouldn't be necessary as the entire ELF interface is generally trivially constructable and extremely cheap. The cost of constructing one of these objects is simply a size check and writing a pointer to the underlying data. Given that, keeping a cache of these images should not be necessary overall. --- .../common/include/GlobalHandler.h | 9 +--- .../common/src/GlobalHandler.cpp | 41 +++++++------------ .../plugins-nextgen/cuda/src/rtl.cpp | 10 ++--- 3 files changed, 20 insertions(+), 40 deletions(-) diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h index fa079ac9660ee..d9fe938790ca7 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h @@ -89,9 +89,6 @@ template <typename Ty> class StaticGlobalTy : public GlobalTy { /// global metadata (size, addr) from the device. /// \see getGlobalMetadataFromDevice class GenericGlobalHandlerTy { - /// Map to store the ELF object files that have been loaded. - llvm::DenseMap<int32_t, ELF64LEObjectFile> ELFObjectFiles; - /// Actually move memory between host and device. See readGlobalFromDevice and /// writeGlobalToDevice for the interface description. Error moveGlobalBetweenDeviceAndHost(GenericDeviceTy &Device, @@ -109,10 +106,8 @@ class GenericGlobalHandlerTy { public: virtual ~GenericGlobalHandlerTy() {} - /// Get the cached ELF64LEObjectFile previosuly created for a specific - /// device image or create it if did not exist. - const ELF64LEObjectFile * - getOrCreateELFObjectFile(const GenericDeviceTy &Device, DeviceImageTy &Image); + /// Helper function for getting an ELF from a device image. + Expected<ELF64LEObjectFile> getELFObjectFile(DeviceImageTy &Image); /// Returns whether the symbol named \p SymName is present in the given \p /// Image. diff --git a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp index 3a272e228c7df..d398f60c55bd1 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/GlobalHandler.cpp @@ -25,29 +25,14 @@ using namespace omp; using namespace target; using namespace plugin; -const ELF64LEObjectFile * -GenericGlobalHandlerTy::getOrCreateELFObjectFile(const GenericDeviceTy &Device, - DeviceImageTy &Image) { +Expected<ELF64LEObjectFile> +GenericGlobalHandlerTy::getELFObjectFile(DeviceImageTy &Image) { + assert(utils::elf::isELF(Image.getMemoryBuffer().getBuffer()) && + "Input is not an ELF file"); - auto Search = ELFObjectFiles.find(Image.getId()); - if (Search != ELFObjectFiles.end()) - // The ELF object file was already there. - return &Search->second; - - // The ELF object file we are checking is not created yet. 
Expected<ELF64LEObjectFile> ElfOrErr = ELF64LEObjectFile::create(Image.getMemoryBuffer()); - if (!ElfOrErr) { - consumeError(ElfOrErr.takeError()); - return nullptr; - } - - auto Result = - ELFObjectFiles.try_emplace(Image.getId(), std::move(ElfOrErr.get())); - assert(Result.second && "Map insertion failed"); - assert(Result.first != ELFObjectFiles.end() && "Map insertion failed"); - - return &Result.first->second; + return ElfOrErr; } Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost( @@ -83,7 +68,8 @@ Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost( return Err; } - DP("Succesfully %s %u bytes associated with global symbol '%s' %s the device " + DP("Succesfully %s %u bytes associated with global symbol '%s' %s the " + "device " "(%p -> %p).\n", Device2Host ? "read" : "write", HostGlobal.getSize(), HostGlobal.getName().data(), Device2Host ? "from" : "to", @@ -98,12 +84,14 @@ bool GenericGlobalHandlerTy::isSymbolInImage(GenericDeviceTy &Device, // Get the ELF object file for the image. Notice the ELF object may already // be created in previous calls, so we can reuse it. If this is unsuccessful // just return false as we couldn't find it. - const ELF64LEObjectFile *ELFObj = getOrCreateELFObjectFile(Device, Image); - if (!ELFObj) + auto ELFObjOrErr = getELFObjectFile(Image); + if (!ELFObjOrErr) { + consumeError(ELFObjOrErr.takeError()); return false; + } // Search the ELF symbol using the symbol name. - auto SymOrErr = utils::elf::getSymbol(*ELFObj, SymName); + auto SymOrErr = utils::elf::getSymbol(*ELFObjOrErr, SymName); if (!SymOrErr) { consumeError(SymOrErr.takeError()); return false; @@ -117,10 +105,9 @@ Error GenericGlobalHandlerTy::getGlobalMetadataFromImage( // Get the ELF object file for the image. Notice the ELF object may already // be created in previous calls, so we can reuse it. - const ELF64LEObjectFile *ELFObj = getOrCreateELFObjectFile(Device, Image); + auto ELFObj = getELFObjectFile(Image); if (!ELFObj) - return Plugin::error("Unable to create ELF object for image %p", - Image.getStart()); + return ELFObj.takeError(); // Search the ELF symbol using the symbol name. auto SymOrErr = utils::elf::getSymbol(*ELFObj, ImageGlobal.getName()); diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp index 0c7535a0da8b9..b0dff917dd0be 100644 --- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp @@ -1063,15 +1063,13 @@ struct CUDADeviceTy : public GenericDeviceTy { // automatically so we must create it ourselves. The backend will emit // several globals that contain function pointers we can call. These are // prefixed with a known name due to Nvidia's lack of section support. - const ELF64LEObjectFile *ELFObj = - Handler.getOrCreateELFObjectFile(*this, Image); - if (!ELFObj) - return Plugin::error("Unable to create ELF object for image %p", - Image.getStart()); + auto ELFObjOrErr = Handler.getELFObjectFile(Image); + if (!ELFObjOrErr) + return ELFObjOrErr.takeError(); // Search for all symbols that contain a constructor or destructor. 
SmallVector<std::pair<StringRef, uint16_t>> Funcs; - for (ELFSymbolRef Sym : ELFObj->symbols()) { + for (ELFSymbolRef Sym : ELFObjOrErr->symbols()) { auto NameOrErr = Sym.getName(); if (!NameOrErr) return NameOrErr.takeError(); From acaff70841f59a1aec2a3c417e9f3a0f14eb47ad Mon Sep 17 00:00:00 2001 From: Maksim Levental <maksim.levental@gmail.com> Date: Wed, 20 Dec 2023 17:29:11 -0600 Subject: [PATCH 007/342] [mlir][python] move transform extras (#76102) --- mlir/python/CMakeLists.txt | 2 +- .../mlir/dialects/transform/__init__.py | 1 + .../transform/extras}/__init__.py | 43 ++++++++++--------- mlir/test/python/dialects/transform_extras.py | 2 +- 4 files changed, 25 insertions(+), 23 deletions(-) rename mlir/python/mlir/{extras/dialects/transform => dialects/transform/extras}/__init__.py (80%) diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index 41d91cf677833..55c5973e40e52 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -172,7 +172,7 @@ declare_mlir_python_sources( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" GEN_ENUM_BINDINGS SOURCES - extras/dialects/transform/__init__.py) + dialects/transform/extras/__init__.py) declare_mlir_dialect_extension_python_bindings( ADD_TO_PARENT MLIRPythonSources.Dialects diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py index 7ae4fefbac412..175634c7d458f 100644 --- a/mlir/python/mlir/dialects/transform/__init__.py +++ b/mlir/python/mlir/dialects/transform/__init__.py @@ -6,6 +6,7 @@ from .._transform_ops_gen import * from .._transform_ops_gen import _Dialect from ..._mlir_libs._mlirDialectsTransform import * +from ..._mlir_libs._mlirDialectsTransform import AnyOpType, OperationType try: from ...ir import * diff --git a/mlir/python/mlir/extras/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/extras/__init__.py similarity index 80% rename from mlir/python/mlir/extras/dialects/transform/__init__.py rename to mlir/python/mlir/dialects/transform/extras/__init__.py index 9e313324318aa..c715dac1ef7eb 100644 --- a/mlir/python/mlir/extras/dialects/transform/__init__.py +++ b/mlir/python/mlir/dialects/transform/extras/__init__.py @@ -2,12 +2,11 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -from __future__ import annotations -from typing import Callable, Optional, Sequence +from typing import Callable, Optional, Sequence, Union from .... import ir -from ....dialects import transform -from ....dialects.transform import structured +from .. import AnyOpType, OperationType, NamedSequenceOp, YieldOp +from .. 
import structured class Handle(ir.Value): @@ -25,16 +24,16 @@ def __init__( self, v: ir.Value, *, - parent: Optional[Handle] = None, - children: Optional[Sequence[Handle]] = None, + parent: Optional["Handle"] = None, + children: Optional[Sequence["Handle"]] = None, ): super().__init__(v) self.parent = parent self.children = children if children is not None else [] -@ir.register_value_caster(transform.AnyOpType.get_static_typeid()) -@ir.register_value_caster(transform.OperationType.get_static_typeid()) +@ir.register_value_caster(AnyOpType.get_static_typeid()) +@ir.register_value_caster(OperationType.get_static_typeid()) class OpHandle(Handle): """ Wrapper around a transform operation handle with methods to chain further @@ -52,11 +51,13 @@ def __init__( def match_ops( self, - ops: str - | ir.OpView - | structured.MatchInterfaceEnum - | Sequence[str | ir.OpView], - ) -> OpHandle: + ops: Union[ + str, + ir.OpView, + structured.MatchInterfaceEnum, + Sequence[Union[str, ir.OpView]], + ], + ) -> "OpHandle": """ Emits a `transform.structured.MatchOp`. Returns a handle to payload ops that match the given names, types, or @@ -70,7 +71,7 @@ def match_ops( if isinstance(ops, str): ops = structured.MatchInterfaceEnum[ops] match_op = structured.MatchOp( - transform.AnyOpType.get(), + AnyOpType.get(), self, interface=ops, ) @@ -78,15 +79,15 @@ def match_ops( # Handle op name(s), either given directly as string or given as op. else: if isinstance(ops, str): - op_type = transform.OperationType.get(ops) + op_type = OperationType.get(ops) op_names = [ops] elif isinstance(ops, Sequence): - op_type = transform.AnyOpType.get() + op_type = AnyOpType.get() op_names = [ op if isinstance(op, str) else op.OPERATION_NAME for op in ops ] else: - op_type = transform.OperationType.get(ops.OPERATION_NAME) + op_type = OperationType.get(ops.OPERATION_NAME) op_names = [ops.OPERATION_NAME] match_op = structured.MatchOp.match_op_names( op_type, @@ -100,7 +101,7 @@ def match_ops( def insert_transform_script( - block_or_insertion_point: ir.Block | ir.InsertionPoint, + block_or_insertion_point: Union[ir.Block, ir.InsertionPoint], script: Callable[[OpHandle], None], dump_script: bool = False, ) -> None: @@ -137,12 +138,12 @@ def test_match_ops_single(module: OpHandle): with context, ir.Location.unknown(context): with insertion_point: - named_sequence_op = transform.NamedSequenceOp( - "__transform_main", [transform.AnyOpType.get()], [] + named_sequence_op = NamedSequenceOp( + "__transform_main", [AnyOpType.get()], [] ) with ir.InsertionPoint(named_sequence_op.body): script(named_sequence_op.bodyTarget) - transform.YieldOp([]) + YieldOp([]) if dump_script: print(named_sequence_op) diff --git a/mlir/test/python/dialects/transform_extras.py b/mlir/test/python/dialects/transform_extras.py index dbfa8a2dc73c4..e7b43ea63c31c 100644 --- a/mlir/test/python/dialects/transform_extras.py +++ b/mlir/test/python/dialects/transform_extras.py @@ -4,7 +4,7 @@ from mlir import ir from mlir.dialects import scf from mlir.dialects.transform import structured -from mlir.extras.dialects.transform import OpHandle, insert_transform_script +from mlir.dialects.transform.extras import OpHandle, insert_transform_script def build_transform_script(script: Callable[[OpHandle], None]): From 3dca63a32f9834d0e5586fc90797b7c94d93e0bf Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Wed, 20 Dec 2023 15:38:43 -0800 Subject: [PATCH 008/342] [symbolizer] Don't threat symbolizer API as optional (#76103) There is an assumption that we dont need 
to to mix sanitizer with symbolizer from different LLVM revison. If so we can detect it by `__sanitizer_symbolize_code` and assume that the rest is present. --- .../sanitizer_symbolizer_posix_libcdep.cpp | 27 +++++++------------ 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp index 28f11352a6b5b..0ddc24802d216 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp @@ -341,15 +341,14 @@ __sanitizer_symbolize_set_inline_frames(bool InlineFrames); class InternalSymbolizer final : public SymbolizerTool { public: static InternalSymbolizer *get(LowLevelAllocator *alloc) { - if (&__sanitizer_symbolize_set_demangle) - CHECK(__sanitizer_symbolize_set_demangle(common_flags()->demangle)); - if (&__sanitizer_symbolize_set_inline_frames) - CHECK(__sanitizer_symbolize_set_inline_frames( - common_flags()->symbolize_inline_frames)); - // These are essential, we don't have InternalSymbolizer without them. - if (&__sanitizer_symbolize_code && &__sanitizer_symbolize_data) - return new (*alloc) InternalSymbolizer(); - return 0; + // These one is the most used one, so we will use it to detect a presence of + // internal symbolizer. + if (&__sanitizer_symbolize_code == nullptr) + return nullptr; + CHECK(__sanitizer_symbolize_set_demangle(common_flags()->demangle)); + CHECK(__sanitizer_symbolize_set_inline_frames( + common_flags()->symbolize_inline_frames)); + return new (*alloc) InternalSymbolizer(); } bool SymbolizePC(uptr addr, SymbolizedStack *stack) override { @@ -371,8 +370,6 @@ class InternalSymbolizer final : public SymbolizerTool { } bool SymbolizeFrame(uptr addr, FrameInfo *info) override { - if (&__sanitizer_symbolize_frame == nullptr) - return false; bool result = __sanitizer_symbolize_frame(info->module, info->module_offset, buffer_, sizeof(buffer_)); if (result) @@ -380,14 +377,10 @@ class InternalSymbolizer final : public SymbolizerTool { return result; } - void Flush() override { - if (&__sanitizer_symbolize_flush) - __sanitizer_symbolize_flush(); - } + void Flush() override { __sanitizer_symbolize_flush(); } const char *Demangle(const char *name) override { - if (&__sanitizer_symbolize_demangle && - __sanitizer_symbolize_demangle(name, buffer_, sizeof(buffer_))) { + if (__sanitizer_symbolize_demangle(name, buffer_, sizeof(buffer_))) { char *res_buff = nullptr; ExtractToken(buffer_, "", &res_buff); return res_buff; From 7c9c807fa4337c8ae36296e99df529938aba4335 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi <geek4civic@gmail.com> Date: Thu, 21 Dec 2023 08:25:21 +0900 Subject: [PATCH 009/342] [Bazel] Update llvm/Config, fixup for 476812a74260 --- .../llvm-project-overlay/llvm/include/llvm/Config/config.h | 6 ------ .../llvm/include/llvm/Config/llvm-config.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h index b4fb2373d571f..da18916b14a76 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h @@ -66,15 +66,9 @@ don't. */ #define HAVE_DECL_STRERROR_S 0 -/* Define to 1 if you have the <dlfcn.h> header file. 
*/ -#define HAVE_DLFCN_H 1 - /* Define if dlopen() is available on this platform. */ #define HAVE_DLOPEN 1 -/* Define if dladdr() is available on this platform. */ -#define HAVE_DLADDR 1 - /* Define to 1 if we can register EH frames on this platform. */ /* HAVE_REGISTER_FRAME defined in Bazel*/ diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h index 5240b8299c109..5235d8303f568 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h @@ -124,4 +124,10 @@ /* Define to 1 if you have the DIA SDK installed, and to 0 if you don't. */ #define LLVM_ENABLE_DIA_SDK 0 +/* Define to 1 if you have the <dlfcn.h> header file. */ +#define HAVE_DLFCN_H 1 + +/* Define if dladdr() is available on this platform. */ +#define HAVE_DLADDR 1 + #endif From e98082d90ab0c0d01d01f73132f70eacf865e8da Mon Sep 17 00:00:00 2001 From: Valentin Clement <clementval@gmail.com> Date: Wed, 20 Dec 2023 16:07:57 -0800 Subject: [PATCH 010/342] Revert "[flang][openacc] Remove unused waitdevnum" This reverts commit 8fdc3b98b894bbbe301b13cf8fc89663e1cbac1a. --- flang/lib/Lower/OpenACC.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index ecf70818c4ac0..59db5ab71b702 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -1783,6 +1783,7 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, // Parallel operation operands mlir::Value ifCond; mlir::Value selfCond; + mlir::Value waitDevnum; llvm::SmallVector<mlir::Value> waitOperands, attachEntryOperands, copyEntryOperands, copyoutEntryOperands, createEntryOperands, dataClauseOperands, numGangs, numWorkers, vectorLength, async; From 553748356c1c59ae1dd2ecdf0f09e9d49f880090 Mon Sep 17 00:00:00 2001 From: Valentin Clement <clementval@gmail.com> Date: Wed, 20 Dec 2023 16:08:10 -0800 Subject: [PATCH 011/342] Revert "[mlir][openacc] Add device_type support for compute operations (#75864)" This reverts commit 8b885eb90ff14862b579b191c3f469a5a4fed1bc. 
--- flang/lib/Lower/OpenACC.cpp | 106 +--- flang/test/Lower/OpenACC/acc-device-type.f90 | 44 -- flang/test/Lower/OpenACC/acc-kernels-loop.f90 | 14 +- flang/test/Lower/OpenACC/acc-kernels.f90 | 14 +- .../test/Lower/OpenACC/acc-parallel-loop.f90 | 14 +- flang/test/Lower/OpenACC/acc-parallel.f90 | 16 +- flang/test/Lower/OpenACC/acc-serial-loop.f90 | 10 +- flang/test/Lower/OpenACC/acc-serial.f90 | 10 +- .../mlir/Dialect/OpenACC/OpenACCOps.td | 286 +++------- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 515 +----------------- mlir/test/Dialect/OpenACC/invalid.mlir | 4 +- mlir/test/Dialect/OpenACC/ops.mlir | 76 +-- mlir/unittests/Dialect/CMakeLists.txt | 1 - mlir/unittests/Dialect/OpenACC/CMakeLists.txt | 8 - .../Dialect/OpenACC/OpenACCOpsTest.cpp | 275 ---------- 15 files changed, 177 insertions(+), 1216 deletions(-) delete mode 100644 flang/test/Lower/OpenACC/acc-device-type.f90 delete mode 100644 mlir/unittests/Dialect/OpenACC/CMakeLists.txt delete mode 100644 mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 59db5ab71b702..fae54eefb02f7 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -1480,7 +1480,7 @@ getDeviceType(Fortran::parser::AccDeviceTypeExpr::Device device) { case Fortran::parser::AccDeviceTypeExpr::Device::Multicore: return mlir::acc::DeviceType::Multicore; } - return mlir::acc::DeviceType::None; + return mlir::acc::DeviceType::Default; } static void gatherDeviceTypeAttrs( @@ -1781,25 +1781,26 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, bool outerCombined = false) { // Parallel operation operands + mlir::Value async; + mlir::Value numWorkers; + mlir::Value vectorLength; mlir::Value ifCond; mlir::Value selfCond; mlir::Value waitDevnum; llvm::SmallVector<mlir::Value> waitOperands, attachEntryOperands, copyEntryOperands, copyoutEntryOperands, createEntryOperands, - dataClauseOperands, numGangs, numWorkers, vectorLength, async; - llvm::SmallVector<mlir::Attribute> numGangsDeviceTypes, numWorkersDeviceTypes, - vectorLengthDeviceTypes, asyncDeviceTypes, asyncOnlyDeviceTypes, - waitOperandsDeviceTypes, waitOnlyDeviceTypes; - llvm::SmallVector<int32_t> numGangsSegments, waitOperandsSegments; + dataClauseOperands, numGangs; llvm::SmallVector<mlir::Value> reductionOperands, privateOperands, firstprivateOperands; llvm::SmallVector<mlir::Attribute> privatizations, firstPrivatizations, reductionRecipes; - // Self clause has optional values but can be present with + // Async, wait and self clause have optional values but can be present with // no value as well. When there is no value, the op has an attribute to // represent the clause. + bool addAsyncAttr = false; + bool addWaitAttr = false; bool addSelfAttr = false; bool hasDefaultNone = false; @@ -1807,11 +1808,6 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - // device_type attribute is set to `none` until a device_type clause is - // encountered. - auto crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( - builder.getContext(), mlir::acc::DeviceType::None); - // Lower clauses values mapped to operands. // Keep track of each group of operands separatly as clauses can appear // more than once. 
@@ -1819,52 +1815,27 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, mlir::Location clauseLocation = converter.genLocation(clause.source); if (const auto *asyncClause = std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) { - const auto &asyncClauseValue = asyncClause->v; - if (asyncClauseValue) { // async has a value. - async.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(*asyncClauseValue), stmtCtx))); - asyncDeviceTypes.push_back(crtDeviceTypeAttr); - } else { - asyncOnlyDeviceTypes.push_back(crtDeviceTypeAttr); - } + genAsyncClause(converter, asyncClause, async, addAsyncAttr, stmtCtx); } else if (const auto *waitClause = std::get_if<Fortran::parser::AccClause::Wait>(&clause.u)) { - const auto &waitClauseValue = waitClause->v; - if (waitClauseValue) { // wait has a value. - const Fortran::parser::AccWaitArgument &waitArg = *waitClauseValue; - const auto &waitList = - std::get<std::list<Fortran::parser::ScalarIntExpr>>(waitArg.t); - auto crtWaitOperands = waitOperands.size(); - for (const Fortran::parser::ScalarIntExpr &value : waitList) { - waitOperands.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(value), stmtCtx))); - } - waitOperandsDeviceTypes.push_back(crtDeviceTypeAttr); - waitOperandsSegments.push_back(waitOperands.size() - crtWaitOperands); - } else { - waitOnlyDeviceTypes.push_back(crtDeviceTypeAttr); - } + genWaitClause(converter, waitClause, waitOperands, waitDevnum, + addWaitAttr, stmtCtx); } else if (const auto *numGangsClause = std::get_if<Fortran::parser::AccClause::NumGangs>( &clause.u)) { - auto crtNumGangs = numGangs.size(); for (const Fortran::parser::ScalarIntExpr &expr : numGangsClause->v) numGangs.push_back(fir::getBase(converter.genExprValue( *Fortran::semantics::GetExpr(expr), stmtCtx))); - numGangsDeviceTypes.push_back(crtDeviceTypeAttr); - numGangsSegments.push_back(numGangs.size() - crtNumGangs); } else if (const auto *numWorkersClause = std::get_if<Fortran::parser::AccClause::NumWorkers>( &clause.u)) { - numWorkers.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(numWorkersClause->v), stmtCtx))); - numWorkersDeviceTypes.push_back(crtDeviceTypeAttr); + numWorkers = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(numWorkersClause->v), stmtCtx)); } else if (const auto *vectorLengthClause = std::get_if<Fortran::parser::AccClause::VectorLength>( &clause.u)) { - vectorLength.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(vectorLengthClause->v), stmtCtx))); - vectorLengthDeviceTypes.push_back(crtDeviceTypeAttr); + vectorLength = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(vectorLengthClause->v), stmtCtx)); } else if (const auto *ifClause = std::get_if<Fortran::parser::AccClause::If>(&clause.u)) { genIfClause(converter, clauseLocation, ifClause, ifCond, stmtCtx); @@ -2015,27 +1986,18 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, else if ((defaultClause->v).v == llvm::acc::DefaultValue::ACC_Default_present) hasDefaultPresent = true; - } else if (const auto *deviceTypeClause = - std::get_if<Fortran::parser::AccClause::DeviceType>( - &clause.u)) { - const Fortran::parser::AccDeviceTypeExprList &deviceTypeExprList = - deviceTypeClause->v; - assert(deviceTypeExprList.v.size() == 1 && - "expect only one device_type expr"); - crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( - builder.getContext(), getDeviceType(deviceTypeExprList.v.front().v)); } } // Prepare the 
operand segment size attribute and the operands value range. llvm::SmallVector<mlir::Value, 8> operands; llvm::SmallVector<int32_t, 8> operandSegments; - addOperands(operands, operandSegments, async); + addOperand(operands, operandSegments, async); addOperands(operands, operandSegments, waitOperands); if constexpr (!std::is_same_v<Op, mlir::acc::SerialOp>) { addOperands(operands, operandSegments, numGangs); - addOperands(operands, operandSegments, numWorkers); - addOperands(operands, operandSegments, vectorLength); + addOperand(operands, operandSegments, numWorkers); + addOperand(operands, operandSegments, vectorLength); } addOperand(operands, operandSegments, ifCond); addOperand(operands, operandSegments, selfCond); @@ -2056,6 +2018,10 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, builder, currentLocation, eval, operands, operandSegments, outerCombined); + if (addAsyncAttr) + computeOp.setAsyncAttrAttr(builder.getUnitAttr()); + if (addWaitAttr) + computeOp.setWaitAttrAttr(builder.getUnitAttr()); if (addSelfAttr) computeOp.setSelfAttrAttr(builder.getUnitAttr()); @@ -2064,34 +2030,6 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, if (hasDefaultPresent) computeOp.setDefaultAttr(mlir::acc::ClauseDefaultValue::Present); - if constexpr (!std::is_same_v<Op, mlir::acc::SerialOp>) { - if (!numWorkersDeviceTypes.empty()) - computeOp.setNumWorkersDeviceTypeAttr( - mlir::ArrayAttr::get(builder.getContext(), numWorkersDeviceTypes)); - if (!vectorLengthDeviceTypes.empty()) - computeOp.setVectorLengthDeviceTypeAttr( - mlir::ArrayAttr::get(builder.getContext(), vectorLengthDeviceTypes)); - if (!numGangsDeviceTypes.empty()) - computeOp.setNumGangsDeviceTypeAttr( - mlir::ArrayAttr::get(builder.getContext(), numGangsDeviceTypes)); - if (!numGangsSegments.empty()) - computeOp.setNumGangsSegmentsAttr( - builder.getDenseI32ArrayAttr(numGangsSegments)); - } - if (!asyncDeviceTypes.empty()) - computeOp.setAsyncDeviceTypeAttr(builder.getArrayAttr(asyncDeviceTypes)); - if (!asyncOnlyDeviceTypes.empty()) - computeOp.setAsyncOnlyAttr(builder.getArrayAttr(asyncOnlyDeviceTypes)); - - if (!waitOperandsDeviceTypes.empty()) - computeOp.setWaitOperandsDeviceTypeAttr( - builder.getArrayAttr(waitOperandsDeviceTypes)); - if (!waitOperandsSegments.empty()) - computeOp.setWaitOperandsSegmentsAttr( - builder.getDenseI32ArrayAttr(waitOperandsSegments)); - if (!waitOnlyDeviceTypes.empty()) - computeOp.setWaitOnlyAttr(builder.getArrayAttr(waitOnlyDeviceTypes)); - if constexpr (!std::is_same_v<Op, mlir::acc::KernelsOp>) { if (!privatizations.empty()) computeOp.setPrivatizationsAttr( diff --git a/flang/test/Lower/OpenACC/acc-device-type.f90 b/flang/test/Lower/OpenACC/acc-device-type.f90 deleted file mode 100644 index 871dbc95f60fc..0000000000000 --- a/flang/test/Lower/OpenACC/acc-device-type.f90 +++ /dev/null @@ -1,44 +0,0 @@ -! This test checks lowering of OpenACC device_type clause on directive where its -! position and the clauses that follow have special semantic - -! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s - -subroutine sub1() - - !$acc parallel num_workers(16) - !$acc end parallel - -! CHECK: acc.parallel num_workers(%c16{{.*}} : i32) { - - !$acc parallel num_workers(1) device_type(nvidia) num_workers(16) - !$acc end parallel - -! CHECK: acc.parallel num_workers(%c1{{.*}} : i32, %c16{{.*}} : i32 [#acc.device_type<nvidia>]) - - !$acc parallel device_type(*) num_workers(1) device_type(nvidia) num_workers(16) - !$acc end parallel - -! 
CHECK: acc.parallel num_workers(%c1{{.*}} : i32 [#acc.device_type<star>], %c16{{.*}} : i32 [#acc.device_type<nvidia>]) - - !$acc parallel vector_length(1) - !$acc end parallel - -! CHECK: acc.parallel vector_length(%c1{{.*}} : i32) - - !$acc parallel device_type(multicore) vector_length(1) - !$acc end parallel - -! CHECK: acc.parallel vector_length(%c1{{.*}} : i32 [#acc.device_type<multicore>]) - - !$acc parallel num_gangs(2) device_type(nvidia) num_gangs(4) - !$acc end parallel - -! CHECK: acc.parallel num_gangs({%c2{{.*}} : i32}, {%c4{{.*}} : i32} [#acc.device_type<nvidia>]) - - !$acc parallel num_gangs(2) device_type(nvidia) num_gangs(1, 1, 1) - !$acc end parallel - -! CHECK: acc.parallel num_gangs({%c2{{.*}} : i32}, {%c1{{.*}} : i32, %c1{{.*}} : i32, %c1{{.*}} : i32} [#acc.device_type<nvidia>]) - - -end subroutine diff --git a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 index 93bc699031d55..34e7232697241 100644 --- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 @@ -62,7 +62,7 @@ subroutine acc_kernels_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} +! CHECK-NEXT: } attributes {asyncAttr} !$acc kernels loop async(1) DO i = 1, n @@ -103,7 +103,7 @@ subroutine acc_kernels_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator -! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} +! CHECK-NEXT: } attributes {waitAttr} !$acc kernels loop wait(1) DO i = 1, n @@ -111,7 +111,7 @@ subroutine acc_kernels_loop END DO ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.kernels wait({[[WAIT1]] : i32}) { +! CHECK: acc.kernels wait([[WAIT1]] : i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -126,7 +126,7 @@ subroutine acc_kernels_loop ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.kernels wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { +! CHECK: acc.kernels wait([[WAIT2]], [[WAIT3]] : i32, i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -141,7 +141,7 @@ subroutine acc_kernels_loop ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.kernels wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { +! CHECK: acc.kernels wait([[WAIT4]], [[WAIT5]] : i32, i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -155,7 +155,7 @@ subroutine acc_kernels_loop END DO ! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.kernels num_gangs({[[NUMGANGS1]] : i32}) { +! CHECK: acc.kernels num_gangs([[NUMGANGS1]] : i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -169,7 +169,7 @@ subroutine acc_kernels_loop END DO ! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.kernels num_gangs({[[NUMGANGS2]] : i32}) { +! CHECK: acc.kernels num_gangs([[NUMGANGS2]] : i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield diff --git a/flang/test/Lower/OpenACC/acc-kernels.f90 b/flang/test/Lower/OpenACC/acc-kernels.f90 index 99629bb835172..1f882c6df5106 100644 --- a/flang/test/Lower/OpenACC/acc-kernels.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels.f90 @@ -40,7 +40,7 @@ subroutine acc_kernels ! CHECK: acc.kernels { ! CHECK: acc.terminator -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} +! 
CHECK-NEXT: } attributes {asyncAttr} !$acc kernels async(1) !$acc end kernels @@ -63,13 +63,13 @@ subroutine acc_kernels ! CHECK: acc.kernels { ! CHECK: acc.terminator -! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} +! CHECK-NEXT: } attributes {waitAttr} !$acc kernels wait(1) !$acc end kernels ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.kernels wait({[[WAIT1]] : i32}) { +! CHECK: acc.kernels wait([[WAIT1]] : i32) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -78,7 +78,7 @@ subroutine acc_kernels ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.kernels wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { +! CHECK: acc.kernels wait([[WAIT2]], [[WAIT3]] : i32, i32) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -87,7 +87,7 @@ subroutine acc_kernels ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.kernels wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { +! CHECK: acc.kernels wait([[WAIT4]], [[WAIT5]] : i32, i32) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -95,7 +95,7 @@ subroutine acc_kernels !$acc end kernels ! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.kernels num_gangs({[[NUMGANGS1]] : i32}) { +! CHECK: acc.kernels num_gangs([[NUMGANGS1]] : i32) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -103,7 +103,7 @@ subroutine acc_kernels !$acc end kernels ! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.kernels num_gangs({[[NUMGANGS2]] : i32}) { +! CHECK: acc.kernels num_gangs([[NUMGANGS2]] : i32) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 index deee7089033ea..1856215ce59d1 100644 --- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 @@ -64,7 +64,7 @@ subroutine acc_parallel_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} +! CHECK-NEXT: } attributes {asyncAttr} !$acc parallel loop async(1) DO i = 1, n @@ -105,7 +105,7 @@ subroutine acc_parallel_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield -! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} +! CHECK-NEXT: } attributes {waitAttr} !$acc parallel loop wait(1) DO i = 1, n @@ -113,7 +113,7 @@ subroutine acc_parallel_loop END DO ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.parallel wait({[[WAIT1]] : i32}) { +! CHECK: acc.parallel wait([[WAIT1]] : i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -128,7 +128,7 @@ subroutine acc_parallel_loop ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.parallel wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { +! CHECK: acc.parallel wait([[WAIT2]], [[WAIT3]] : i32, i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -143,7 +143,7 @@ subroutine acc_parallel_loop ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.parallel wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { +! CHECK: acc.parallel wait([[WAIT4]], [[WAIT5]] : i32, i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -157,7 +157,7 @@ subroutine acc_parallel_loop END DO ! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32 -! 
CHECK: acc.parallel num_gangs({[[NUMGANGS1]] : i32}) { +! CHECK: acc.parallel num_gangs([[NUMGANGS1]] : i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -171,7 +171,7 @@ subroutine acc_parallel_loop END DO ! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.parallel num_gangs({[[NUMGANGS2]] : i32}) { +! CHECK: acc.parallel num_gangs([[NUMGANGS2]] : i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield diff --git a/flang/test/Lower/OpenACC/acc-parallel.f90 b/flang/test/Lower/OpenACC/acc-parallel.f90 index a369bf01f2599..bbf51ba36a7de 100644 --- a/flang/test/Lower/OpenACC/acc-parallel.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel.f90 @@ -62,7 +62,7 @@ subroutine acc_parallel ! CHECK: acc.parallel { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} +! CHECK-NEXT: } attributes {asyncAttr} !$acc parallel async(1) !$acc end parallel @@ -85,13 +85,13 @@ subroutine acc_parallel ! CHECK: acc.parallel { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} +! CHECK-NEXT: } attributes {waitAttr} !$acc parallel wait(1) !$acc end parallel ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.parallel wait({[[WAIT1]] : i32}) { +! CHECK: acc.parallel wait([[WAIT1]] : i32) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -100,7 +100,7 @@ subroutine acc_parallel ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.parallel wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { +! CHECK: acc.parallel wait([[WAIT2]], [[WAIT3]] : i32, i32) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -109,7 +109,7 @@ subroutine acc_parallel ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.parallel wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { +! CHECK: acc.parallel wait([[WAIT4]], [[WAIT5]] : i32, i32) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -117,7 +117,7 @@ subroutine acc_parallel !$acc end parallel ! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.parallel num_gangs({[[NUMGANGS1]] : i32}) { +! CHECK: acc.parallel num_gangs([[NUMGANGS1]] : i32) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -125,14 +125,14 @@ subroutine acc_parallel !$acc end parallel ! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.parallel num_gangs({[[NUMGANGS2]] : i32}) { +! CHECK: acc.parallel num_gangs([[NUMGANGS2]] : i32) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} !$acc parallel num_gangs(1, 1, 1) !$acc end parallel -! CHECK: acc.parallel num_gangs({%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32}) { +! CHECK: acc.parallel num_gangs(%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90 index 712bfc80ce387..4ed7bb8da29a1 100644 --- a/flang/test/Lower/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90 @@ -83,7 +83,7 @@ subroutine acc_serial_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} +! CHECK-NEXT: } attributes {asyncAttr} !$acc serial loop async(1) DO i = 1, n @@ -124,7 +124,7 @@ subroutine acc_serial_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield -! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} +! 
CHECK-NEXT: } attributes {waitAttr} !$acc serial loop wait(1) DO i = 1, n @@ -132,7 +132,7 @@ subroutine acc_serial_loop END DO ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.serial wait({[[WAIT1]] : i32}) { +! CHECK: acc.serial wait([[WAIT1]] : i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -147,7 +147,7 @@ subroutine acc_serial_loop ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.serial wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { +! CHECK: acc.serial wait([[WAIT2]], [[WAIT3]] : i32, i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -162,7 +162,7 @@ subroutine acc_serial_loop ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.serial wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { +! CHECK: acc.serial wait([[WAIT4]], [[WAIT5]] : i32, i32) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield diff --git a/flang/test/Lower/OpenACC/acc-serial.f90 b/flang/test/Lower/OpenACC/acc-serial.f90 index d05e51d3d274f..ab3b0ccd54595 100644 --- a/flang/test/Lower/OpenACC/acc-serial.f90 +++ b/flang/test/Lower/OpenACC/acc-serial.f90 @@ -62,7 +62,7 @@ subroutine acc_serial ! CHECK: acc.serial { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} +! CHECK-NEXT: } attributes {asyncAttr} !$acc serial async(1) !$acc end serial @@ -85,13 +85,13 @@ subroutine acc_serial ! CHECK: acc.serial { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} +! CHECK-NEXT: } attributes {waitAttr} !$acc serial wait(1) !$acc end serial ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.serial wait({[[WAIT1]] : i32}) { +! CHECK: acc.serial wait([[WAIT1]] : i32) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -100,7 +100,7 @@ subroutine acc_serial ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.serial wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { +! CHECK: acc.serial wait([[WAIT2]], [[WAIT3]] : i32, i32) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -109,7 +109,7 @@ subroutine acc_serial ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.serial wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { +! CHECK: acc.serial wait([[WAIT4]], [[WAIT5]] : i32, i32) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 234c1076e14e3..a78c3e98c9551 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -156,46 +156,29 @@ def DeclareActionAttr : OpenACC_Attr<"DeclareAction", "declare_action"> { } // Device type enumeration. 
-def OpenACC_DeviceTypeNone : I32EnumAttrCase<"None", 0, "none">; -def OpenACC_DeviceTypeStar : I32EnumAttrCase<"Star", 1, "star">; -def OpenACC_DeviceTypeDefault : I32EnumAttrCase<"Default", 2, "default">; -def OpenACC_DeviceTypeHost : I32EnumAttrCase<"Host", 3, "host">; -def OpenACC_DeviceTypeMulticore : I32EnumAttrCase<"Multicore", 4, "multicore">; -def OpenACC_DeviceTypeNvidia : I32EnumAttrCase<"Nvidia", 5, "nvidia">; -def OpenACC_DeviceTypeRadeon : I32EnumAttrCase<"Radeon", 6, "radeon">; +def OpenACC_DeviceTypeStar : I32EnumAttrCase<"Star", 0, "star">; +def OpenACC_DeviceTypeDefault : I32EnumAttrCase<"Default", 1, "default">; +def OpenACC_DeviceTypeHost : I32EnumAttrCase<"Host", 2, "host">; +def OpenACC_DeviceTypeMulticore : I32EnumAttrCase<"Multicore", 3, "multicore">; +def OpenACC_DeviceTypeNvidia : I32EnumAttrCase<"Nvidia", 4, "nvidia">; +def OpenACC_DeviceTypeRadeon : I32EnumAttrCase<"Radeon", 5, "radeon">; + def OpenACC_DeviceType : I32EnumAttr<"DeviceType", "built-in device type supported by OpenACC", - [OpenACC_DeviceTypeNone, OpenACC_DeviceTypeStar, OpenACC_DeviceTypeDefault, + [OpenACC_DeviceTypeStar, OpenACC_DeviceTypeDefault, OpenACC_DeviceTypeHost, OpenACC_DeviceTypeMulticore, OpenACC_DeviceTypeNvidia, OpenACC_DeviceTypeRadeon ]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::acc"; } - -// Device type attribute is used to associate a value for for clauses that -// appear after a device_type clause. The list of clauses allowed after the -// device_type clause is defined per construct as follows: -// Loop construct: collapse, gang, worker, vector, seq, independent, auto, -// and tile -// Compute construct: async, wait, num_gangs, num_workers, and vector_length -// Data construct: async and wait -// Routine: gang, worker, vector, seq and bind -// -// The `none` means that the value appears before any device_type clause. -// def OpenACC_DeviceTypeAttr : EnumAttr<OpenACC_Dialect, OpenACC_DeviceType, "device_type"> { let assemblyFormat = [{ ```<` $value `>` }]; } -def DeviceTypeArrayAttr : - TypedArrayAttrBase<OpenACC_DeviceTypeAttr, "device type array attribute"> { - let constBuilderCall = ?; -} - // Define a resource for the OpenACC runtime counters. 
def OpenACC_RuntimeCounters : Resource<"::mlir::acc::RuntimeCounters">; @@ -880,32 +863,24 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel", ``` }]; - let arguments = (ins - Variadic<IntOrIndex>:$async, - OptionalAttr<DeviceTypeArrayAttr>:$asyncDeviceType, - OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly, - Variadic<IntOrIndex>:$waitOperands, - OptionalAttr<DenseI32ArrayAttr>:$waitOperandsSegments, - OptionalAttr<DeviceTypeArrayAttr>:$waitOperandsDeviceType, - OptionalAttr<DeviceTypeArrayAttr>:$waitOnly, - Variadic<IntOrIndex>:$numGangs, - OptionalAttr<DenseI32ArrayAttr>:$numGangsSegments, - OptionalAttr<DeviceTypeArrayAttr>:$numGangsDeviceType, - Variadic<IntOrIndex>:$numWorkers, - OptionalAttr<DeviceTypeArrayAttr>:$numWorkersDeviceType, - Variadic<IntOrIndex>:$vectorLength, - OptionalAttr<DeviceTypeArrayAttr>:$vectorLengthDeviceType, - Optional<I1>:$ifCond, - Optional<I1>:$selfCond, - UnitAttr:$selfAttr, - Variadic<AnyType>:$reductionOperands, - OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes, - Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands, - OptionalAttr<SymbolRefArrayAttr>:$privatizations, - Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands, - OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations, - Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, - OptionalAttr<DefaultValueAttr>:$defaultAttr); + let arguments = (ins Optional<IntOrIndex>:$async, + UnitAttr:$asyncAttr, + Variadic<IntOrIndex>:$waitOperands, + UnitAttr:$waitAttr, + Variadic<IntOrIndex>:$numGangs, + Optional<IntOrIndex>:$numWorkers, + Optional<IntOrIndex>:$vectorLength, + Optional<I1>:$ifCond, + Optional<I1>:$selfCond, + UnitAttr:$selfAttr, + Variadic<AnyType>:$reductionOperands, + OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes, + Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands, + OptionalAttr<SymbolRefArrayAttr>:$privatizations, + Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands, + OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations, + Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, + OptionalAttr<DefaultValueAttr>:$defaultAttr); let regions = (region AnyRegion:$region); @@ -915,69 +890,22 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel", /// The i-th data operand passed. Value getDataOperand(unsigned i); - - /// Return true if the op has the async attribute for the - /// mlir::acc::DeviceType::None device_type. - bool hasAsyncOnly(); - /// Return true if the op has the async attribute for the given device_type. - bool hasAsyncOnly(mlir::acc::DeviceType deviceType); - /// Return the value of the async clause if present. - mlir::Value getAsyncValue(); - /// Return the value of the async clause for the given device_type if - /// present. - mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType); - - /// Return the value of the num_workers clause if present. - mlir::Value getNumWorkersValue(); - /// Return the value of the num_workers clause for the given device_type if - /// present. - mlir::Value getNumWorkersValue(mlir::acc::DeviceType deviceType); - - /// Return the value of the vector_length clause if present. - mlir::Value getVectorLengthValue(); - /// Return the value of the vector_length clause for the given device_type - /// if present. - mlir::Value getVectorLengthValue(mlir::acc::DeviceType deviceType); - - /// Return the values of the num_gangs clause if present. 
- mlir::Operation::operand_range getNumGangsValues(); - /// Return the values of the num_gangs clause for the given device_type if - /// present. - mlir::Operation::operand_range - getNumGangsValues(mlir::acc::DeviceType deviceType); - - /// Return true if the op has the wait attribute for the - /// mlir::acc::DeviceType::None device_type. - bool hasWaitOnly(); - /// Return true if the op has the wait attribute for the given device_type. - bool hasWaitOnly(mlir::acc::DeviceType deviceType); - /// Return the values of the wait clause if present. - mlir::Operation::operand_range getWaitValues(); - /// Return the values of the wait clause for the given device_type if - /// present. - mlir::Operation::operand_range - getWaitValues(mlir::acc::DeviceType deviceType); }]; let assemblyFormat = [{ oilist( `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` - | `async` `(` custom<DeviceTypeOperands>($async, - type($async), $asyncDeviceType) `)` + | `async` `(` $async `:` type($async) `)` | `firstprivate` `(` custom<SymOperandList>($gangFirstPrivateOperands, type($gangFirstPrivateOperands), $firstprivatizations) `)` - | `num_gangs` `(` custom<NumGangs>($numGangs, - type($numGangs), $numGangsDeviceType, $numGangsSegments) `)` - | `num_workers` `(` custom<DeviceTypeOperands>($numWorkers, - type($numWorkers), $numWorkersDeviceType) `)` + | `num_gangs` `(` $numGangs `:` type($numGangs) `)` + | `num_workers` `(` $numWorkers `:` type($numWorkers) `)` | `private` `(` custom<SymOperandList>( $gangPrivateOperands, type($gangPrivateOperands), $privatizations) `)` - | `vector_length` `(` custom<DeviceTypeOperands>($vectorLength, - type($vectorLength), $vectorLengthDeviceType) `)` - | `wait` `(` custom<WaitOperands>($waitOperands, - type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)` + | `vector_length` `(` $vectorLength `:` type($vectorLength) `)` + | `wait` `(` $waitOperands `:` type($waitOperands) `)` | `self` `(` $selfCond `)` | `if` `(` $ifCond `)` | `reduction` `(` custom<SymOperandList>( @@ -1011,25 +939,21 @@ def OpenACC_SerialOp : OpenACC_Op<"serial", ``` }]; - let arguments = (ins - Variadic<IntOrIndex>:$async, - OptionalAttr<DeviceTypeArrayAttr>:$asyncDeviceType, - OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly, - Variadic<IntOrIndex>:$waitOperands, - OptionalAttr<DenseI32ArrayAttr>:$waitOperandsSegments, - OptionalAttr<DeviceTypeArrayAttr>:$waitOperandsDeviceType, - OptionalAttr<DeviceTypeArrayAttr>:$waitOnly, - Optional<I1>:$ifCond, - Optional<I1>:$selfCond, - UnitAttr:$selfAttr, - Variadic<AnyType>:$reductionOperands, - OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes, - Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands, - OptionalAttr<SymbolRefArrayAttr>:$privatizations, - Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands, - OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations, - Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, - OptionalAttr<DefaultValueAttr>:$defaultAttr); + let arguments = (ins Optional<IntOrIndex>:$async, + UnitAttr:$asyncAttr, + Variadic<IntOrIndex>:$waitOperands, + UnitAttr:$waitAttr, + Optional<I1>:$ifCond, + Optional<I1>:$selfCond, + UnitAttr:$selfAttr, + Variadic<AnyType>:$reductionOperands, + OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes, + Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands, + OptionalAttr<SymbolRefArrayAttr>:$privatizations, + Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands, + 
OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations, + Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, + OptionalAttr<DefaultValueAttr>:$defaultAttr); let regions = (region AnyRegion:$region); @@ -1039,44 +963,19 @@ def OpenACC_SerialOp : OpenACC_Op<"serial", /// The i-th data operand passed. Value getDataOperand(unsigned i); - - /// Return true if the op has the async attribute for the - /// mlir::acc::DeviceType::None device_type. - bool hasAsyncOnly(); - /// Return true if the op has the async attribute for the given device_type. - bool hasAsyncOnly(mlir::acc::DeviceType deviceType); - /// Return the value of the async clause if present. - mlir::Value getAsyncValue(); - /// Return the value of the async clause for the given device_type if - /// present. - mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType); - - /// Return true if the op has the wait attribute for the - /// mlir::acc::DeviceType::None device_type. - bool hasWaitOnly(); - /// Return true if the op has the wait attribute for the given device_type. - bool hasWaitOnly(mlir::acc::DeviceType deviceType); - /// Return the values of the wait clause if present. - mlir::Operation::operand_range getWaitValues(); - /// Return the values of the wait clause for the given device_type if - /// present. - mlir::Operation::operand_range - getWaitValues(mlir::acc::DeviceType deviceType); }]; let assemblyFormat = [{ oilist( `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` - | `async` `(` custom<DeviceTypeOperands>($async, - type($async), $asyncDeviceType) `)` + | `async` `(` $async `:` type($async) `)` | `firstprivate` `(` custom<SymOperandList>($gangFirstPrivateOperands, type($gangFirstPrivateOperands), $firstprivatizations) `)` | `private` `(` custom<SymOperandList>( $gangPrivateOperands, type($gangPrivateOperands), $privatizations) `)` - | `wait` `(` custom<WaitOperands>($waitOperands, - type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)` + | `wait` `(` $waitOperands `:` type($waitOperands) `)` | `self` `(` $selfCond `)` | `if` `(` $ifCond `)` | `reduction` `(` custom<SymOperandList>( @@ -1112,26 +1011,18 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels", ``` }]; - let arguments = (ins - Variadic<IntOrIndex>:$async, - OptionalAttr<DeviceTypeArrayAttr>:$asyncDeviceType, - OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly, - Variadic<IntOrIndex>:$waitOperands, - OptionalAttr<DenseI32ArrayAttr>:$waitOperandsSegments, - OptionalAttr<DeviceTypeArrayAttr>:$waitOperandsDeviceType, - OptionalAttr<DeviceTypeArrayAttr>:$waitOnly, - Variadic<IntOrIndex>:$numGangs, - OptionalAttr<DenseI32ArrayAttr>:$numGangsSegments, - OptionalAttr<DeviceTypeArrayAttr>:$numGangsDeviceType, - Variadic<IntOrIndex>:$numWorkers, - OptionalAttr<DeviceTypeArrayAttr>:$numWorkersDeviceType, - Variadic<IntOrIndex>:$vectorLength, - OptionalAttr<DeviceTypeArrayAttr>:$vectorLengthDeviceType, - Optional<I1>:$ifCond, - Optional<I1>:$selfCond, - UnitAttr:$selfAttr, - Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, - OptionalAttr<DefaultValueAttr>:$defaultAttr); + let arguments = (ins Optional<IntOrIndex>:$async, + UnitAttr:$asyncAttr, + Variadic<IntOrIndex>:$waitOperands, + UnitAttr:$waitAttr, + Variadic<IntOrIndex>:$numGangs, + Optional<IntOrIndex>:$numWorkers, + Optional<IntOrIndex>:$vectorLength, + Optional<I1>:$ifCond, + Optional<I1>:$selfCond, + UnitAttr:$selfAttr, + Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, + OptionalAttr<DefaultValueAttr>:$defaultAttr); let regions = 
(region AnyRegion:$region); @@ -1141,63 +1032,16 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels", /// The i-th data operand passed. Value getDataOperand(unsigned i); - - /// Return true if the op has the async attribute for the - /// mlir::acc::DeviceType::None device_type. - bool hasAsyncOnly(); - /// Return true if the op has the async attribute for the given device_type. - bool hasAsyncOnly(mlir::acc::DeviceType deviceType); - /// Return the value of the async clause if present. - mlir::Value getAsyncValue(); - /// Return the value of the async clause for the given device_type if - /// present. - mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType); - - /// Return the value of the num_workers clause if present. - mlir::Value getNumWorkersValue(); - /// Return the value of the num_workers clause for the given device_type if - /// present. - mlir::Value getNumWorkersValue(mlir::acc::DeviceType deviceType); - - /// Return the value of the vector_length clause if present. - mlir::Value getVectorLengthValue(); - /// Return the value of the vector_length clause for the given device_type - /// if present. - mlir::Value getVectorLengthValue(mlir::acc::DeviceType deviceType); - - /// Return the values of the num_gangs clause if present. - mlir::Operation::operand_range getNumGangsValues(); - /// Return the values of the num_gangs clause for the given device_type if - /// present. - mlir::Operation::operand_range - getNumGangsValues(mlir::acc::DeviceType deviceType); - - /// Return true if the op has the wait attribute for the - /// mlir::acc::DeviceType::None device_type. - bool hasWaitOnly(); - /// Return true if the op has the wait attribute for the given device_type. - bool hasWaitOnly(mlir::acc::DeviceType deviceType); - /// Return the values of the wait clause if present. - mlir::Operation::operand_range getWaitValues(); - /// Return the values of the wait clause for the given device_type if - /// present. - mlir::Operation::operand_range - getWaitValues(mlir::acc::DeviceType deviceType); }]; let assemblyFormat = [{ oilist( `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` - | `async` `(` custom<DeviceTypeOperands>($async, - type($async), $asyncDeviceType) `)` - | `num_gangs` `(` custom<NumGangs>($numGangs, - type($numGangs), $numGangsDeviceType, $numGangsSegments) `)` - | `num_workers` `(` custom<DeviceTypeOperands>($numWorkers, - type($numWorkers), $numWorkersDeviceType) `)` - | `vector_length` `(` custom<DeviceTypeOperands>($vectorLength, - type($vectorLength), $vectorLengthDeviceType) `)` - | `wait` `(` custom<WaitOperands>($waitOperands, - type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)` + | `async` `(` $async `:` type($async) `)` + | `num_gangs` `(` $numGangs `:` type($numGangs) `)` + | `num_workers` `(` $numWorkers `:` type($numWorkers) `)` + | `vector_length` `(` $vectorLength `:` type($vectorLength) `)` + | `wait` `(` $waitOperands `:` type($waitOperands) `)` | `self` `(` $selfCond `)` | `if` `(` $ifCond `)` ) diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 45e0632db5ef2..df4f7825545c2 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -615,49 +615,15 @@ unsigned ParallelOp::getNumDataOperands() { } Value ParallelOp::getDataOperand(unsigned i) { - unsigned numOptional = getAsync().size(); + unsigned numOptional = getAsync() ? 
1 : 0; numOptional += getNumGangs().size(); - numOptional += getNumWorkers().size(); - numOptional += getVectorLength().size(); + numOptional += getNumWorkers() ? 1 : 0; + numOptional += getVectorLength() ? 1 : 0; numOptional += getIfCond() ? 1 : 0; numOptional += getSelfCond() ? 1 : 0; return getOperand(getWaitOperands().size() + numOptional + i); } -template <typename Op> -static LogicalResult verifyDeviceTypeCountMatch(Op op, OperandRange operands, - ArrayAttr deviceTypes, - llvm::StringRef keyword) { - if (operands.size() > 0 && deviceTypes.getValue().size() != operands.size()) - return op.emitOpError() << keyword << " operands count must match " - << keyword << " device_type count"; - return success(); -} - -template <typename Op> -static LogicalResult verifyDeviceTypeAndSegmentCountMatch( - Op op, OperandRange operands, DenseI32ArrayAttr segments, - ArrayAttr deviceTypes, llvm::StringRef keyword, int32_t maxInSegment = 0) { - std::size_t numOperandsInSegments = 0; - - if (!segments) - return success(); - - for (auto segCount : segments.asArrayRef()) { - if (maxInSegment != 0 && segCount > maxInSegment) - return op.emitOpError() << keyword << " expects a maximum of " - << maxInSegment << " values per segment"; - numOperandsInSegments += segCount; - } - if (numOperandsInSegments != operands.size()) - return op.emitOpError() - << keyword << " operand count does not match count in segments"; - if (deviceTypes.getValue().size() != (size_t)segments.size()) - return op.emitOpError() - << keyword << " segment count does not match device_type count"; - return success(); -} - LogicalResult acc::ParallelOp::verify() { if (failed(checkSymOperandList<mlir::acc::PrivateRecipeOp>( *this, getPrivatizations(), getGangPrivateOperands(), "private", @@ -667,322 +633,11 @@ LogicalResult acc::ParallelOp::verify() { *this, getReductionRecipes(), getReductionOperands(), "reduction", "reductions", false))) return failure(); - - if (failed(verifyDeviceTypeAndSegmentCountMatch( - *this, getNumGangs(), getNumGangsSegmentsAttr(), - getNumGangsDeviceTypeAttr(), "num_gangs", 3))) - return failure(); - - if (failed(verifyDeviceTypeAndSegmentCountMatch( - *this, getWaitOperands(), getWaitOperandsSegmentsAttr(), - getWaitOperandsDeviceTypeAttr(), "wait"))) - return failure(); - - if (failed(verifyDeviceTypeCountMatch(*this, getNumWorkers(), - getNumWorkersDeviceTypeAttr(), - "num_workers"))) - return failure(); - - if (failed(verifyDeviceTypeCountMatch(*this, getVectorLength(), - getVectorLengthDeviceTypeAttr(), - "vector_length"))) - return failure(); - - if (failed(verifyDeviceTypeCountMatch(*this, getAsync(), - getAsyncDeviceTypeAttr(), "async"))) - return failure(); - + if (getNumGangs().size() > 3) + return emitOpError() << "num_gangs expects a maximum of 3 values"; return checkDataOperands<acc::ParallelOp>(*this, getDataClauseOperands()); } -static std::optional<unsigned> findSegment(ArrayAttr segments, - mlir::acc::DeviceType deviceType) { - unsigned segmentIdx = 0; - for (auto attr : segments) { - auto deviceTypeAttr = mlir::dyn_cast<mlir::acc::DeviceTypeAttr>(attr); - if (deviceTypeAttr.getValue() == deviceType) - return std::make_optional(segmentIdx); - ++segmentIdx; - } - return std::nullopt; -} - -static mlir::Value -getValueInDeviceTypeSegment(std::optional<mlir::ArrayAttr> arrayAttr, - mlir::Operation::operand_range range, - mlir::acc::DeviceType deviceType) { - if (!arrayAttr) - return {}; - if (auto pos = findSegment(*arrayAttr, deviceType)) - return range[*pos]; - return {}; -} - -bool 
acc::ParallelOp::hasAsyncOnly() { - return hasAsyncOnly(mlir::acc::DeviceType::None); -} - -bool acc::ParallelOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) { - if (auto arrayAttr = getAsyncOnly()) { - if (findSegment(*arrayAttr, deviceType)) - return true; - } - return false; -} - -mlir::Value acc::ParallelOp::getAsyncValue() { - return getAsyncValue(mlir::acc::DeviceType::None); -} - -mlir::Value acc::ParallelOp::getAsyncValue(mlir::acc::DeviceType deviceType) { - return getValueInDeviceTypeSegment(getAsyncDeviceType(), getAsync(), - deviceType); -} - -mlir::Value acc::ParallelOp::getNumWorkersValue() { - return getNumWorkersValue(mlir::acc::DeviceType::None); -} - -mlir::Value -acc::ParallelOp::getNumWorkersValue(mlir::acc::DeviceType deviceType) { - return getValueInDeviceTypeSegment(getNumWorkersDeviceType(), getNumWorkers(), - deviceType); -} - -mlir::Value acc::ParallelOp::getVectorLengthValue() { - return getVectorLengthValue(mlir::acc::DeviceType::None); -} - -mlir::Value -acc::ParallelOp::getVectorLengthValue(mlir::acc::DeviceType deviceType) { - return getValueInDeviceTypeSegment(getVectorLengthDeviceType(), - getVectorLength(), deviceType); -} - -mlir::Operation::operand_range ParallelOp::getNumGangsValues() { - return getNumGangsValues(mlir::acc::DeviceType::None); -} - -static mlir::Operation::operand_range -getValuesFromSegments(std::optional<mlir::ArrayAttr> arrayAttr, - mlir::Operation::operand_range range, - std::optional<llvm::ArrayRef<int32_t>> segments, - mlir::acc::DeviceType deviceType) { - if (!arrayAttr) - return range.take_front(0); - if (auto pos = findSegment(*arrayAttr, deviceType)) { - int32_t nbOperandsBefore = 0; - for (unsigned i = 0; i < *pos; ++i) - nbOperandsBefore += (*segments)[i]; - return range.drop_front(nbOperandsBefore).take_front((*segments)[*pos]); - } - return range.take_front(0); -} - -mlir::Operation::operand_range -ParallelOp::getNumGangsValues(mlir::acc::DeviceType deviceType) { - return getValuesFromSegments(getNumGangsDeviceType(), getNumGangs(), - getNumGangsSegments(), deviceType); -} - -bool acc::ParallelOp::hasWaitOnly() { - return hasWaitOnly(mlir::acc::DeviceType::None); -} - -bool acc::ParallelOp::hasWaitOnly(mlir::acc::DeviceType deviceType) { - if (auto arrayAttr = getWaitOnly()) { - if (findSegment(*arrayAttr, deviceType)) - return true; - } - return false; -} - -mlir::Operation::operand_range ParallelOp::getWaitValues() { - return getWaitValues(mlir::acc::DeviceType::None); -} - -mlir::Operation::operand_range -ParallelOp::getWaitValues(mlir::acc::DeviceType deviceType) { - return getValuesFromSegments(getWaitOperandsDeviceType(), getWaitOperands(), - getWaitOperandsSegments(), deviceType); -} - -static ParseResult parseNumGangs( - mlir::OpAsmParser &parser, - llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &operands, - llvm::SmallVectorImpl<Type> &types, mlir::ArrayAttr &deviceTypes, - mlir::DenseI32ArrayAttr &segments) { - llvm::SmallVector<DeviceTypeAttr> attributes; - llvm::SmallVector<int32_t> seg; - - do { - if (failed(parser.parseLBrace())) - return failure(); - - if (failed(parser.parseCommaSeparatedList( - mlir::AsmParser::Delimiter::None, [&]() { - if (parser.parseOperand(operands.emplace_back()) || - parser.parseColonType(types.emplace_back())) - return failure(); - return success(); - }))) - return failure(); - - seg.push_back(operands.size()); - - if (failed(parser.parseRBrace())) - return failure(); - - if (succeeded(parser.parseOptionalLSquare())) { - if 
(parser.parseAttribute(attributes.emplace_back()) || - parser.parseRSquare()) - return failure(); - } else { - attributes.push_back(mlir::acc::DeviceTypeAttr::get( - parser.getContext(), mlir::acc::DeviceType::None)); - } - } while (succeeded(parser.parseOptionalComma())); - - llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(), - attributes.end()); - deviceTypes = ArrayAttr::get(parser.getContext(), arrayAttr); - segments = DenseI32ArrayAttr::get(parser.getContext(), seg); - - return success(); -} - -static void printNumGangs(mlir::OpAsmPrinter &p, mlir::Operation *op, - mlir::OperandRange operands, mlir::TypeRange types, - std::optional<mlir::ArrayAttr> deviceTypes, - std::optional<mlir::DenseI32ArrayAttr> segments) { - unsigned opIdx = 0; - for (unsigned i = 0; i < deviceTypes->size(); ++i) { - if (i != 0) - p << ", "; - p << "{"; - for (int32_t j = 0; j < (*segments)[i]; ++j) { - if (j != 0) - p << ", "; - p << operands[opIdx] << " : " << operands[opIdx].getType(); - ++opIdx; - } - p << "}"; - auto deviceTypeAttr = - mlir::dyn_cast<mlir::acc::DeviceTypeAttr>((*deviceTypes)[i]); - if (deviceTypeAttr.getValue() != mlir::acc::DeviceType::None) - p << " [" << (*deviceTypes)[i] << "]"; - } -} - -static ParseResult parseWaitOperands( - mlir::OpAsmParser &parser, - llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &operands, - llvm::SmallVectorImpl<Type> &types, mlir::ArrayAttr &deviceTypes, - mlir::DenseI32ArrayAttr &segments) { - llvm::SmallVector<DeviceTypeAttr> attributes; - llvm::SmallVector<int32_t> seg; - - do { - if (failed(parser.parseLBrace())) - return failure(); - - if (failed(parser.parseCommaSeparatedList( - mlir::AsmParser::Delimiter::None, [&]() { - if (parser.parseOperand(operands.emplace_back()) || - parser.parseColonType(types.emplace_back())) - return failure(); - return success(); - }))) - return failure(); - - seg.push_back(operands.size()); - - if (failed(parser.parseRBrace())) - return failure(); - - if (succeeded(parser.parseOptionalLSquare())) { - if (parser.parseAttribute(attributes.emplace_back()) || - parser.parseRSquare()) - return failure(); - } else { - attributes.push_back(mlir::acc::DeviceTypeAttr::get( - parser.getContext(), mlir::acc::DeviceType::None)); - } - } while (succeeded(parser.parseOptionalComma())); - - llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(), - attributes.end()); - deviceTypes = ArrayAttr::get(parser.getContext(), arrayAttr); - segments = DenseI32ArrayAttr::get(parser.getContext(), seg); - - return success(); -} - -static void printWaitOperands(mlir::OpAsmPrinter &p, mlir::Operation *op, - mlir::OperandRange operands, - mlir::TypeRange types, - std::optional<mlir::ArrayAttr> deviceTypes, - std::optional<mlir::DenseI32ArrayAttr> segments) { - unsigned opIdx = 0; - for (unsigned i = 0; i < deviceTypes->size(); ++i) { - if (i != 0) - p << ", "; - p << "{"; - for (int32_t j = 0; j < (*segments)[i]; ++j) { - if (j != 0) - p << ", "; - p << operands[opIdx] << " : " << operands[opIdx].getType(); - ++opIdx; - } - p << "}"; - auto deviceTypeAttr = - mlir::dyn_cast<mlir::acc::DeviceTypeAttr>((*deviceTypes)[i]); - if (deviceTypeAttr.getValue() != mlir::acc::DeviceType::None) - p << " [" << (*deviceTypes)[i] << "]"; - } -} - -static ParseResult parseDeviceTypeOperands( - mlir::OpAsmParser &parser, - llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &operands, - llvm::SmallVectorImpl<Type> &types, mlir::ArrayAttr &deviceTypes) { - llvm::SmallVector<DeviceTypeAttr> attributes; - if 
(failed(parser.parseCommaSeparatedList([&]() { - if (parser.parseOperand(operands.emplace_back()) || - parser.parseColonType(types.emplace_back())) - return failure(); - if (succeeded(parser.parseOptionalLSquare())) { - if (parser.parseAttribute(attributes.emplace_back()) || - parser.parseRSquare()) - return failure(); - } else { - attributes.push_back(mlir::acc::DeviceTypeAttr::get( - parser.getContext(), mlir::acc::DeviceType::None)); - } - return success(); - }))) - return failure(); - llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(), - attributes.end()); - deviceTypes = ArrayAttr::get(parser.getContext(), arrayAttr); - return success(); -} - -static void -printDeviceTypeOperands(mlir::OpAsmPrinter &p, mlir::Operation *op, - mlir::OperandRange operands, mlir::TypeRange types, - std::optional<mlir::ArrayAttr> deviceTypes) { - for (unsigned i = 0, e = deviceTypes->size(); i < e; ++i) { - if (i != 0) - p << ", "; - p << operands[i] << " : " << operands[i].getType(); - auto deviceTypeAttr = - mlir::dyn_cast<mlir::acc::DeviceTypeAttr>((*deviceTypes)[i]); - if (deviceTypeAttr.getValue() != mlir::acc::DeviceType::None) - p << " [" << (*deviceTypes)[i] << "]"; - } -} - //===----------------------------------------------------------------------===// // SerialOp //===----------------------------------------------------------------------===// @@ -993,55 +648,12 @@ unsigned SerialOp::getNumDataOperands() { } Value SerialOp::getDataOperand(unsigned i) { - unsigned numOptional = getAsync().size(); + unsigned numOptional = getAsync() ? 1 : 0; numOptional += getIfCond() ? 1 : 0; numOptional += getSelfCond() ? 1 : 0; return getOperand(getWaitOperands().size() + numOptional + i); } -bool acc::SerialOp::hasAsyncOnly() { - return hasAsyncOnly(mlir::acc::DeviceType::None); -} - -bool acc::SerialOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) { - if (auto arrayAttr = getAsyncOnly()) { - if (findSegment(*arrayAttr, deviceType)) - return true; - } - return false; -} - -mlir::Value acc::SerialOp::getAsyncValue() { - return getAsyncValue(mlir::acc::DeviceType::None); -} - -mlir::Value acc::SerialOp::getAsyncValue(mlir::acc::DeviceType deviceType) { - return getValueInDeviceTypeSegment(getAsyncDeviceType(), getAsync(), - deviceType); -} - -bool acc::SerialOp::hasWaitOnly() { - return hasWaitOnly(mlir::acc::DeviceType::None); -} - -bool acc::SerialOp::hasWaitOnly(mlir::acc::DeviceType deviceType) { - if (auto arrayAttr = getWaitOnly()) { - if (findSegment(*arrayAttr, deviceType)) - return true; - } - return false; -} - -mlir::Operation::operand_range SerialOp::getWaitValues() { - return getWaitValues(mlir::acc::DeviceType::None); -} - -mlir::Operation::operand_range -SerialOp::getWaitValues(mlir::acc::DeviceType deviceType) { - return getValuesFromSegments(getWaitOperandsDeviceType(), getWaitOperands(), - getWaitOperandsSegments(), deviceType); -} - LogicalResult acc::SerialOp::verify() { if (failed(checkSymOperandList<mlir::acc::PrivateRecipeOp>( *this, getPrivatizations(), getGangPrivateOperands(), "private", @@ -1051,16 +663,6 @@ LogicalResult acc::SerialOp::verify() { *this, getReductionRecipes(), getReductionOperands(), "reduction", "reductions", false))) return failure(); - - if (failed(verifyDeviceTypeAndSegmentCountMatch( - *this, getWaitOperands(), getWaitOperandsSegmentsAttr(), - getWaitOperandsDeviceTypeAttr(), "wait"))) - return failure(); - - if (failed(verifyDeviceTypeCountMatch(*this, getAsync(), - getAsyncDeviceTypeAttr(), "async"))) - return failure(); - return 
checkDataOperands<acc::SerialOp>(*this, getDataClauseOperands()); } @@ -1073,114 +675,19 @@ unsigned KernelsOp::getNumDataOperands() { } Value KernelsOp::getDataOperand(unsigned i) { - unsigned numOptional = getAsync().size(); + unsigned numOptional = getAsync() ? 1 : 0; numOptional += getWaitOperands().size(); numOptional += getNumGangs().size(); - numOptional += getNumWorkers().size(); - numOptional += getVectorLength().size(); + numOptional += getNumWorkers() ? 1 : 0; + numOptional += getVectorLength() ? 1 : 0; numOptional += getIfCond() ? 1 : 0; numOptional += getSelfCond() ? 1 : 0; return getOperand(numOptional + i); } -bool acc::KernelsOp::hasAsyncOnly() { - return hasAsyncOnly(mlir::acc::DeviceType::None); -} - -bool acc::KernelsOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) { - if (auto arrayAttr = getAsyncOnly()) { - if (findSegment(*arrayAttr, deviceType)) - return true; - } - return false; -} - -mlir::Value acc::KernelsOp::getAsyncValue() { - return getAsyncValue(mlir::acc::DeviceType::None); -} - -mlir::Value acc::KernelsOp::getAsyncValue(mlir::acc::DeviceType deviceType) { - return getValueInDeviceTypeSegment(getAsyncDeviceType(), getAsync(), - deviceType); -} - -mlir::Value acc::KernelsOp::getNumWorkersValue() { - return getNumWorkersValue(mlir::acc::DeviceType::None); -} - -mlir::Value -acc::KernelsOp::getNumWorkersValue(mlir::acc::DeviceType deviceType) { - return getValueInDeviceTypeSegment(getNumWorkersDeviceType(), getNumWorkers(), - deviceType); -} - -mlir::Value acc::KernelsOp::getVectorLengthValue() { - return getVectorLengthValue(mlir::acc::DeviceType::None); -} - -mlir::Value -acc::KernelsOp::getVectorLengthValue(mlir::acc::DeviceType deviceType) { - return getValueInDeviceTypeSegment(getVectorLengthDeviceType(), - getVectorLength(), deviceType); -} - -mlir::Operation::operand_range KernelsOp::getNumGangsValues() { - return getNumGangsValues(mlir::acc::DeviceType::None); -} - -mlir::Operation::operand_range -KernelsOp::getNumGangsValues(mlir::acc::DeviceType deviceType) { - return getValuesFromSegments(getNumGangsDeviceType(), getNumGangs(), - getNumGangsSegments(), deviceType); -} - -bool acc::KernelsOp::hasWaitOnly() { - return hasWaitOnly(mlir::acc::DeviceType::None); -} - -bool acc::KernelsOp::hasWaitOnly(mlir::acc::DeviceType deviceType) { - if (auto arrayAttr = getWaitOnly()) { - if (findSegment(*arrayAttr, deviceType)) - return true; - } - return false; -} - -mlir::Operation::operand_range KernelsOp::getWaitValues() { - return getWaitValues(mlir::acc::DeviceType::None); -} - -mlir::Operation::operand_range -KernelsOp::getWaitValues(mlir::acc::DeviceType deviceType) { - return getValuesFromSegments(getWaitOperandsDeviceType(), getWaitOperands(), - getWaitOperandsSegments(), deviceType); -} - LogicalResult acc::KernelsOp::verify() { - if (failed(verifyDeviceTypeAndSegmentCountMatch( - *this, getNumGangs(), getNumGangsSegmentsAttr(), - getNumGangsDeviceTypeAttr(), "num_gangs", 3))) - return failure(); - - if (failed(verifyDeviceTypeAndSegmentCountMatch( - *this, getWaitOperands(), getWaitOperandsSegmentsAttr(), - getWaitOperandsDeviceTypeAttr(), "wait"))) - return failure(); - - if (failed(verifyDeviceTypeCountMatch(*this, getNumWorkers(), - getNumWorkersDeviceTypeAttr(), - "num_workers"))) - return failure(); - - if (failed(verifyDeviceTypeCountMatch(*this, getVectorLength(), - getVectorLengthDeviceTypeAttr(), - "vector_length"))) - return failure(); - - if (failed(verifyDeviceTypeCountMatch(*this, getAsync(), - getAsyncDeviceTypeAttr(), "async"))) - return 
failure(); - + if (getNumGangs().size() > 3) + return emitOpError() << "num_gangs expects a maximum of 3 values"; return checkDataOperands<acc::KernelsOp>(*this, getDataClauseOperands()); } diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir index c18d964b370f2..b9ac68d0592c8 100644 --- a/mlir/test/Dialect/OpenACC/invalid.mlir +++ b/mlir/test/Dialect/OpenACC/invalid.mlir @@ -462,8 +462,8 @@ acc.loop gang() { // ----- %i64value = arith.constant 1 : i64 -// expected-error@+1 {{num_gangs expects a maximum of 3 values per segment}} -acc.parallel num_gangs({%i64value: i64, %i64value : i64, %i64value : i64, %i64value : i64}) { +// expected-error@+1 {{num_gangs expects a maximum of 3 values}} +acc.parallel num_gangs(%i64value, %i64value, %i64value, %i64value : i64, i64, i64, i64) { } // ----- diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 5a95811685f84..05b0450c7fb91 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -137,7 +137,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x %pd = acc.present varPtr(%d : memref<10xf32>) -> memref<10xf32> acc.data dataOperands(%pa, %pb, %pc, %pd: memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) { %private = acc.private varPtr(%c : memref<10xf32>) -> memref<10xf32> - acc.parallel num_gangs({%numGangs: i64}) num_workers(%numWorkers: i64 [#acc.device_type<nvidia>]) private(@privatization_memref_10_f32 -> %private : memref<10xf32>) { + acc.parallel num_gangs(%numGangs: i64) num_workers(%numWorkers: i64) private(@privatization_memref_10_f32 -> %private : memref<10xf32>) { acc.loop gang { scf.for %x = %lb to %c10 step %st { acc.loop worker { @@ -180,7 +180,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x // CHECK-NEXT: [[NUMWORKERS:%.*]] = arith.constant 10 : i64 // CHECK: acc.data dataOperands(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) { // CHECK-NEXT: %[[P_ARG2:.*]] = acc.private varPtr([[ARG2]] : memref<10xf32>) -> memref<10xf32> -// CHECK-NEXT: acc.parallel num_gangs({[[NUMGANG]] : i64}) num_workers([[NUMWORKERS]] : i64 [#acc.device_type<nvidia>]) private(@privatization_memref_10_f32 -> %[[P_ARG2]] : memref<10xf32>) { +// CHECK-NEXT: acc.parallel num_gangs([[NUMGANG]] : i64) num_workers([[NUMWORKERS]] : i64) private(@privatization_memref_10_f32 -> %[[P_ARG2]] : memref<10xf32>) { // CHECK-NEXT: acc.loop gang { // CHECK-NEXT: scf.for %{{.*}} = [[C0]] to [[C10]] step [[C1]] { // CHECK-NEXT: acc.loop worker { @@ -439,25 +439,25 @@ func.func @testparallelop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x } acc.parallel async(%idxValue: index) { } - acc.parallel wait({%i64value: i64}) { + acc.parallel wait(%i64value: i64) { } - acc.parallel wait({%i32value: i32}) { + acc.parallel wait(%i32value: i32) { } - acc.parallel wait({%idxValue: index}) { + acc.parallel wait(%idxValue: index) { } - acc.parallel wait({%i64value : i64, %i32value : i32, %idxValue : index}) { + acc.parallel wait(%i64value, %i32value, %idxValue : i64, i32, index) { } - acc.parallel num_gangs({%i64value: i64}) { + acc.parallel num_gangs(%i64value: i64) { } - acc.parallel num_gangs({%i32value: i32}) { + acc.parallel num_gangs(%i32value: i32) { } - acc.parallel num_gangs({%idxValue: index}) { + acc.parallel num_gangs(%idxValue: index) { } - acc.parallel num_gangs({%i64value: i64, %i64value: i64, %idxValue: index}) { + 
acc.parallel num_gangs(%i64value, %i64value, %idxValue : i64, i64, index) { } - acc.parallel num_workers(%i64value: i64 [#acc.device_type<nvidia>]) { + acc.parallel num_workers(%i64value: i64) { } - acc.parallel num_workers(%i32value: i32 [#acc.device_type<default>]) { + acc.parallel num_workers(%i32value: i32) { } acc.parallel num_workers(%idxValue: index) { } @@ -492,25 +492,25 @@ func.func @testparallelop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x // CHECK-NEXT: } // CHECK: acc.parallel async([[IDXVALUE]] : index) { // CHECK-NEXT: } -// CHECK: acc.parallel wait({[[I64VALUE]] : i64}) { +// CHECK: acc.parallel wait([[I64VALUE]] : i64) { // CHECK-NEXT: } -// CHECK: acc.parallel wait({[[I32VALUE]] : i32}) { +// CHECK: acc.parallel wait([[I32VALUE]] : i32) { // CHECK-NEXT: } -// CHECK: acc.parallel wait({[[IDXVALUE]] : index}) { +// CHECK: acc.parallel wait([[IDXVALUE]] : index) { // CHECK-NEXT: } -// CHECK: acc.parallel wait({[[I64VALUE]] : i64, [[I32VALUE]] : i32, [[IDXVALUE]] : index}) { +// CHECK: acc.parallel wait([[I64VALUE]], [[I32VALUE]], [[IDXVALUE]] : i64, i32, index) { // CHECK-NEXT: } -// CHECK: acc.parallel num_gangs({[[I64VALUE]] : i64}) { +// CHECK: acc.parallel num_gangs([[I64VALUE]] : i64) { // CHECK-NEXT: } -// CHECK: acc.parallel num_gangs({[[I32VALUE]] : i32}) { +// CHECK: acc.parallel num_gangs([[I32VALUE]] : i32) { // CHECK-NEXT: } -// CHECK: acc.parallel num_gangs({[[IDXVALUE]] : index}) { +// CHECK: acc.parallel num_gangs([[IDXVALUE]] : index) { // CHECK-NEXT: } -// CHECK: acc.parallel num_gangs({[[I64VALUE]] : i64, [[I64VALUE]] : i64, [[IDXVALUE]] : index}) { +// CHECK: acc.parallel num_gangs([[I64VALUE]], [[I64VALUE]], [[IDXVALUE]] : i64, i64, index) { // CHECK-NEXT: } -// CHECK: acc.parallel num_workers([[I64VALUE]] : i64 [#acc.device_type<nvidia>]) { +// CHECK: acc.parallel num_workers([[I64VALUE]] : i64) { // CHECK-NEXT: } -// CHECK: acc.parallel num_workers([[I32VALUE]] : i32 [#acc.device_type<default>]) { +// CHECK: acc.parallel num_workers([[I32VALUE]] : i32) { // CHECK-NEXT: } // CHECK: acc.parallel num_workers([[IDXVALUE]] : index) { // CHECK-NEXT: } @@ -590,13 +590,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 } acc.serial async(%idxValue: index) { } - acc.serial wait({%i64value: i64}) { + acc.serial wait(%i64value: i64) { } - acc.serial wait({%i32value: i32}) { + acc.serial wait(%i32value: i32) { } - acc.serial wait({%idxValue: index}) { + acc.serial wait(%idxValue: index) { } - acc.serial wait({%i64value : i64, %i32value : i32, %idxValue : index}) { + acc.serial wait(%i64value, %i32value, %idxValue : i64, i32, index) { } %firstprivate = acc.firstprivate varPtr(%b : memref<10xf32>) -> memref<10xf32> acc.serial private(@privatization_memref_10_f32 -> %a : memref<10xf32>, @privatization_memref_10_10_f32 -> %c : memref<10x10xf32>) firstprivate(@firstprivatization_memref_10xf32 -> %firstprivate : memref<10xf32>) { @@ -627,13 +627,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 // CHECK-NEXT: } // CHECK: acc.serial async([[IDXVALUE]] : index) { // CHECK-NEXT: } -// CHECK: acc.serial wait({[[I64VALUE]] : i64}) { +// CHECK: acc.serial wait([[I64VALUE]] : i64) { // CHECK-NEXT: } -// CHECK: acc.serial wait({[[I32VALUE]] : i32}) { +// CHECK: acc.serial wait([[I32VALUE]] : i32) { // CHECK-NEXT: } -// CHECK: acc.serial wait({[[IDXVALUE]] : index}) { +// CHECK: acc.serial wait([[IDXVALUE]] : index) { // CHECK-NEXT: } -// CHECK: acc.serial wait({[[I64VALUE]] : i64, [[I32VALUE]] : 
i32, [[IDXVALUE]] : index}) { +// CHECK: acc.serial wait([[I64VALUE]], [[I32VALUE]], [[IDXVALUE]] : i64, i32, index) { // CHECK-NEXT: } // CHECK: %[[FIRSTP:.*]] = acc.firstprivate varPtr([[ARGB]] : memref<10xf32>) -> memref<10xf32> // CHECK: acc.serial firstprivate(@firstprivatization_memref_10xf32 -> %[[FIRSTP]] : memref<10xf32>) private(@privatization_memref_10_f32 -> [[ARGA]] : memref<10xf32>, @privatization_memref_10_10_f32 -> [[ARGC]] : memref<10x10xf32>) { @@ -665,13 +665,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 } acc.kernels async(%idxValue: index) { } - acc.kernels wait({%i64value: i64}) { + acc.kernels wait(%i64value: i64) { } - acc.kernels wait({%i32value: i32}) { + acc.kernels wait(%i32value: i32) { } - acc.kernels wait({%idxValue: index}) { + acc.kernels wait(%idxValue: index) { } - acc.kernels wait({%i64value : i64, %i32value : i32, %idxValue : index}) { + acc.kernels wait(%i64value, %i32value, %idxValue : i64, i32, index) { } acc.kernels { } attributes {defaultAttr = #acc<defaultvalue none>} @@ -699,13 +699,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 // CHECK-NEXT: } // CHECK: acc.kernels async([[IDXVALUE]] : index) { // CHECK-NEXT: } -// CHECK: acc.kernels wait({[[I64VALUE]] : i64}) { +// CHECK: acc.kernels wait([[I64VALUE]] : i64) { // CHECK-NEXT: } -// CHECK: acc.kernels wait({[[I32VALUE]] : i32}) { +// CHECK: acc.kernels wait([[I32VALUE]] : i32) { // CHECK-NEXT: } -// CHECK: acc.kernels wait({[[IDXVALUE]] : index}) { +// CHECK: acc.kernels wait([[IDXVALUE]] : index) { // CHECK-NEXT: } -// CHECK: acc.kernels wait({[[I64VALUE]] : i64, [[I32VALUE]] : i32, [[IDXVALUE]] : index}) { +// CHECK: acc.kernels wait([[I64VALUE]], [[I32VALUE]], [[IDXVALUE]] : i64, i32, index) { // CHECK-NEXT: } // CHECK: acc.kernels { // CHECK-NEXT: } attributes {defaultAttr = #acc<defaultvalue none>} diff --git a/mlir/unittests/Dialect/CMakeLists.txt b/mlir/unittests/Dialect/CMakeLists.txt index 13393569f36fe..2dec4ba3c001e 100644 --- a/mlir/unittests/Dialect/CMakeLists.txt +++ b/mlir/unittests/Dialect/CMakeLists.txt @@ -10,7 +10,6 @@ add_subdirectory(ArmSME) add_subdirectory(Index) add_subdirectory(LLVMIR) add_subdirectory(MemRef) -add_subdirectory(OpenACC) add_subdirectory(SCF) add_subdirectory(SparseTensor) add_subdirectory(SPIRV) diff --git a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt deleted file mode 100644 index 5133d7fc38296..0000000000000 --- a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -add_mlir_unittest(MLIROpenACCTests - OpenACCOpsTest.cpp -) -target_link_libraries(MLIROpenACCTests - PRIVATE - MLIRIR - MLIROpenACCDialect -) diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp deleted file mode 100644 index dcf6c1240c55d..0000000000000 --- a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp +++ /dev/null @@ -1,275 +0,0 @@ -//===- OpenACCOpsTest.cpp - OpenACC ops extra functiosn Tests -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/OpenACC/OpenACC.h" -#include "mlir/IR/Diagnostics.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/OwningOpRef.h" -#include "gtest/gtest.h" - -using namespace mlir; -using namespace mlir::acc; - -//===----------------------------------------------------------------------===// -// Test Fixture -//===----------------------------------------------------------------------===// - -class OpenACCOpsTest : public ::testing::Test { -protected: - OpenACCOpsTest() : b(&context), loc(UnknownLoc::get(&context)) { - context.loadDialect<acc::OpenACCDialect, arith::ArithDialect>(); - } - - MLIRContext context; - OpBuilder b; - Location loc; - llvm::SmallVector<DeviceType> dtypes = { - DeviceType::None, DeviceType::Star, DeviceType::Multicore, - DeviceType::Default, DeviceType::Host, DeviceType::Nvidia, - DeviceType::Radeon}; - llvm::SmallVector<DeviceType> dtypesWithoutNone = { - DeviceType::Star, DeviceType::Multicore, DeviceType::Default, - DeviceType::Host, DeviceType::Nvidia, DeviceType::Radeon}; -}; - -template <typename Op> -void testAsyncOnly(OpBuilder &b, MLIRContext &context, Location loc, - llvm::SmallVector<DeviceType> &dtypes) { - Op op = b.create<Op>(loc, TypeRange{}, ValueRange{}); - EXPECT_FALSE(op.hasAsyncOnly()); - for (auto d : dtypes) - EXPECT_FALSE(op.hasAsyncOnly(d)); - - auto dtypeNone = DeviceTypeAttr::get(&context, DeviceType::None); - op.setAsyncOnlyAttr(b.getArrayAttr({dtypeNone})); - EXPECT_TRUE(op.hasAsyncOnly()); - EXPECT_TRUE(op.hasAsyncOnly(DeviceType::None)); - op.removeAsyncOnlyAttr(); - - auto dtypeHost = DeviceTypeAttr::get(&context, DeviceType::Host); - op.setAsyncOnlyAttr(b.getArrayAttr({dtypeHost})); - EXPECT_TRUE(op.hasAsyncOnly(DeviceType::Host)); - EXPECT_FALSE(op.hasAsyncOnly()); - op.removeAsyncOnlyAttr(); - - auto dtypeStar = DeviceTypeAttr::get(&context, DeviceType::Star); - op.setAsyncOnlyAttr(b.getArrayAttr({dtypeHost, dtypeStar})); - EXPECT_TRUE(op.hasAsyncOnly(DeviceType::Star)); - EXPECT_TRUE(op.hasAsyncOnly(DeviceType::Host)); - EXPECT_FALSE(op.hasAsyncOnly()); -} - -TEST_F(OpenACCOpsTest, asyncOnlyTest) { - testAsyncOnly<ParallelOp>(b, context, loc, dtypes); - testAsyncOnly<KernelsOp>(b, context, loc, dtypes); - testAsyncOnly<SerialOp>(b, context, loc, dtypes); -} - -template <typename Op> -void testAsyncValue(OpBuilder &b, MLIRContext &context, Location loc, - llvm::SmallVector<DeviceType> &dtypes) { - Op op = b.create<Op>(loc, TypeRange{}, ValueRange{}); - - mlir::Value empty; - EXPECT_EQ(op.getAsyncValue(), empty); - for (auto d : dtypes) - EXPECT_EQ(op.getAsyncValue(d), empty); - - mlir::Value val = b.create<arith::ConstantOp>(loc, b.getI32IntegerAttr(1)); - auto dtypeNvidia = DeviceTypeAttr::get(&context, DeviceType::Nvidia); - op.setAsyncDeviceTypeAttr(b.getArrayAttr({dtypeNvidia})); - op.getAsyncMutable().assign(val); - EXPECT_EQ(op.getAsyncValue(), empty); - EXPECT_EQ(op.getAsyncValue(DeviceType::Nvidia), val); -} - -TEST_F(OpenACCOpsTest, asyncValueTest) { - testAsyncValue<ParallelOp>(b, context, loc, dtypes); - testAsyncValue<KernelsOp>(b, context, loc, dtypes); - testAsyncValue<SerialOp>(b, context, loc, dtypes); -} - -template <typename Op> -void testNumGangsValues(OpBuilder &b, MLIRContext &context, Location loc, - llvm::SmallVector<DeviceType> &dtypes, - llvm::SmallVector<DeviceType> &dtypesWithoutNone) { - Op op = 
b.create<Op>(loc, TypeRange{}, ValueRange{}); - EXPECT_EQ(op.getNumGangsValues().begin(), op.getNumGangsValues().end()); - - mlir::Value val1 = b.create<arith::ConstantOp>(loc, b.getI32IntegerAttr(1)); - mlir::Value val2 = b.create<arith::ConstantOp>(loc, b.getI32IntegerAttr(4)); - auto dtypeNone = DeviceTypeAttr::get(&context, DeviceType::None); - op.getNumGangsMutable().assign(val1); - op.setNumGangsDeviceTypeAttr(b.getArrayAttr({dtypeNone})); - op.setNumGangsSegments(b.getDenseI32ArrayAttr({1})); - EXPECT_EQ(op.getNumGangsValues().front(), val1); - for (auto d : dtypesWithoutNone) - EXPECT_EQ(op.getNumGangsValues(d).begin(), op.getNumGangsValues(d).end()); - - op.getNumGangsMutable().clear(); - op.removeNumGangsDeviceTypeAttr(); - op.removeNumGangsSegmentsAttr(); - for (auto d : dtypes) - EXPECT_EQ(op.getNumGangsValues(d).begin(), op.getNumGangsValues(d).end()); - - op.getNumGangsMutable().append(val1); - op.getNumGangsMutable().append(val2); - op.setNumGangsDeviceTypeAttr( - b.getArrayAttr({DeviceTypeAttr::get(&context, DeviceType::Host), - DeviceTypeAttr::get(&context, DeviceType::Star)})); - op.setNumGangsSegments(b.getDenseI32ArrayAttr({1, 1})); - EXPECT_EQ(op.getNumGangsValues(DeviceType::None).begin(), - op.getNumGangsValues(DeviceType::None).end()); - EXPECT_EQ(op.getNumGangsValues(DeviceType::Host).front(), val1); - EXPECT_EQ(op.getNumGangsValues(DeviceType::Star).front(), val2); - - op.getNumGangsMutable().clear(); - op.removeNumGangsDeviceTypeAttr(); - op.removeNumGangsSegmentsAttr(); - for (auto d : dtypes) - EXPECT_EQ(op.getNumGangsValues(d).begin(), op.getNumGangsValues(d).end()); - - op.getNumGangsMutable().append(val1); - op.getNumGangsMutable().append(val2); - op.getNumGangsMutable().append(val1); - op.setNumGangsDeviceTypeAttr( - b.getArrayAttr({DeviceTypeAttr::get(&context, DeviceType::Default), - DeviceTypeAttr::get(&context, DeviceType::Multicore)})); - op.setNumGangsSegments(b.getDenseI32ArrayAttr({2, 1})); - EXPECT_EQ(op.getNumGangsValues(DeviceType::None).begin(), - op.getNumGangsValues(DeviceType::None).end()); - EXPECT_EQ(op.getNumGangsValues(DeviceType::Default).front(), val1); - EXPECT_EQ(op.getNumGangsValues(DeviceType::Default).drop_front().front(), - val2); - EXPECT_EQ(op.getNumGangsValues(DeviceType::Multicore).front(), val1); -} - -TEST_F(OpenACCOpsTest, numGangsValuesTest) { - testNumGangsValues<ParallelOp>(b, context, loc, dtypes, dtypesWithoutNone); - testNumGangsValues<KernelsOp>(b, context, loc, dtypes, dtypesWithoutNone); -} - -template <typename Op> -void testVectorLength(OpBuilder &b, MLIRContext &context, Location loc, - llvm::SmallVector<DeviceType> &dtypes) { - auto op = b.create<Op>(loc, TypeRange{}, ValueRange{}); - - mlir::Value empty; - EXPECT_EQ(op.getVectorLengthValue(), empty); - for (auto d : dtypes) - EXPECT_EQ(op.getVectorLengthValue(d), empty); - - mlir::Value val = b.create<arith::ConstantOp>(loc, b.getI32IntegerAttr(1)); - auto dtypeNvidia = DeviceTypeAttr::get(&context, DeviceType::Nvidia); - op.setVectorLengthDeviceTypeAttr(b.getArrayAttr({dtypeNvidia})); - op.getVectorLengthMutable().assign(val); - EXPECT_EQ(op.getVectorLengthValue(), empty); - EXPECT_EQ(op.getVectorLengthValue(DeviceType::Nvidia), val); -} - -TEST_F(OpenACCOpsTest, vectorLengthTest) { - testVectorLength<ParallelOp>(b, context, loc, dtypes); - testVectorLength<KernelsOp>(b, context, loc, dtypes); -} - -template <typename Op> -void testWaitOnly(OpBuilder &b, MLIRContext &context, Location loc, - llvm::SmallVector<DeviceType> &dtypes, - 
llvm::SmallVector<DeviceType> &dtypesWithoutNone) { - Op op = b.create<Op>(loc, TypeRange{}, ValueRange{}); - EXPECT_FALSE(op.hasWaitOnly()); - for (auto d : dtypes) - EXPECT_FALSE(op.hasWaitOnly(d)); - - auto dtypeNone = DeviceTypeAttr::get(&context, DeviceType::None); - op.setWaitOnlyAttr(b.getArrayAttr({dtypeNone})); - EXPECT_TRUE(op.hasWaitOnly()); - EXPECT_TRUE(op.hasWaitOnly(DeviceType::None)); - for (auto d : dtypesWithoutNone) - EXPECT_FALSE(op.hasWaitOnly(d)); - op.removeWaitOnlyAttr(); - - auto dtypeHost = DeviceTypeAttr::get(&context, DeviceType::Host); - op.setWaitOnlyAttr(b.getArrayAttr({dtypeHost})); - EXPECT_TRUE(op.hasWaitOnly(DeviceType::Host)); - EXPECT_FALSE(op.hasWaitOnly()); - op.removeWaitOnlyAttr(); - - auto dtypeStar = DeviceTypeAttr::get(&context, DeviceType::Star); - op.setWaitOnlyAttr(b.getArrayAttr({dtypeHost, dtypeStar})); - EXPECT_TRUE(op.hasWaitOnly(DeviceType::Star)); - EXPECT_TRUE(op.hasWaitOnly(DeviceType::Host)); - EXPECT_FALSE(op.hasWaitOnly()); -} - -TEST_F(OpenACCOpsTest, waitOnlyTest) { - testWaitOnly<ParallelOp>(b, context, loc, dtypes, dtypesWithoutNone); - testWaitOnly<KernelsOp>(b, context, loc, dtypes, dtypesWithoutNone); - testWaitOnly<SerialOp>(b, context, loc, dtypes, dtypesWithoutNone); -} - -template <typename Op> -void testWaitValues(OpBuilder &b, MLIRContext &context, Location loc, - llvm::SmallVector<DeviceType> &dtypes, - llvm::SmallVector<DeviceType> &dtypesWithoutNone) { - Op op = b.create<Op>(loc, TypeRange{}, ValueRange{}); - EXPECT_EQ(op.getWaitValues().begin(), op.getWaitValues().end()); - - mlir::Value val1 = b.create<arith::ConstantOp>(loc, b.getI32IntegerAttr(1)); - mlir::Value val2 = b.create<arith::ConstantOp>(loc, b.getI32IntegerAttr(4)); - auto dtypeNone = DeviceTypeAttr::get(&context, DeviceType::None); - op.getWaitOperandsMutable().assign(val1); - op.setWaitOperandsDeviceTypeAttr(b.getArrayAttr({dtypeNone})); - op.setWaitOperandsSegments(b.getDenseI32ArrayAttr({1})); - EXPECT_EQ(op.getWaitValues().front(), val1); - for (auto d : dtypesWithoutNone) - EXPECT_EQ(op.getWaitValues(d).begin(), op.getWaitValues(d).end()); - - op.getWaitOperandsMutable().clear(); - op.removeWaitOperandsDeviceTypeAttr(); - op.removeWaitOperandsSegmentsAttr(); - for (auto d : dtypes) - EXPECT_EQ(op.getWaitValues(d).begin(), op.getWaitValues(d).end()); - - op.getWaitOperandsMutable().append(val1); - op.getWaitOperandsMutable().append(val2); - op.setWaitOperandsDeviceTypeAttr( - b.getArrayAttr({DeviceTypeAttr::get(&context, DeviceType::Host), - DeviceTypeAttr::get(&context, DeviceType::Star)})); - op.setWaitOperandsSegments(b.getDenseI32ArrayAttr({1, 1})); - EXPECT_EQ(op.getWaitValues(DeviceType::None).begin(), - op.getWaitValues(DeviceType::None).end()); - EXPECT_EQ(op.getWaitValues(DeviceType::Host).front(), val1); - EXPECT_EQ(op.getWaitValues(DeviceType::Star).front(), val2); - - op.getWaitOperandsMutable().clear(); - op.removeWaitOperandsDeviceTypeAttr(); - op.removeWaitOperandsSegmentsAttr(); - for (auto d : dtypes) - EXPECT_EQ(op.getWaitValues(d).begin(), op.getWaitValues(d).end()); - - op.getWaitOperandsMutable().append(val1); - op.getWaitOperandsMutable().append(val2); - op.getWaitOperandsMutable().append(val1); - op.setWaitOperandsDeviceTypeAttr( - b.getArrayAttr({DeviceTypeAttr::get(&context, DeviceType::Default), - DeviceTypeAttr::get(&context, DeviceType::Multicore)})); - op.setWaitOperandsSegments(b.getDenseI32ArrayAttr({2, 1})); - EXPECT_EQ(op.getWaitValues(DeviceType::None).begin(), - op.getWaitValues(DeviceType::None).end()); - 
EXPECT_EQ(op.getWaitValues(DeviceType::Default).front(), val1); - EXPECT_EQ(op.getWaitValues(DeviceType::Default).drop_front().front(), val2); - EXPECT_EQ(op.getWaitValues(DeviceType::Multicore).front(), val1); -} - -TEST_F(OpenACCOpsTest, waitValuesTest) { - testWaitValues<KernelsOp>(b, context, loc, dtypes, dtypesWithoutNone); - testWaitValues<ParallelOp>(b, context, loc, dtypes, dtypesWithoutNone); - testWaitValues<SerialOp>(b, context, loc, dtypes, dtypesWithoutNone); -} From ad4cead67cff7cedacd32249799d43f1a59db706 Mon Sep 17 00:00:00 2001 From: Alexander Yermolovich <43973793+ayermolo@users.noreply.github.com> Date: Wed, 20 Dec 2023 16:12:52 -0800 Subject: [PATCH 012/342] [BOLT][DWARF][NFC] Initialize CloneUnitCtxMap with current partition size (#75876) We would always allocate the maximum amount for the vector containing DWARFUnitInfo. In real use cases what ends up happening is that we allocate a giant vector when processing one CU, or, in the thin-LTO case, multiple CUs. This led to a lot of memory overhead, and a 2x BOLT processing slowdown for at least one service built with monolithic DWARF. For binaries built with LTO with clang, all CUs that have cross-CU references will share an abbrev table and will be processed in one batch. The rest of the CUs are processed in batches of --cu-processing-batch-size, which defaults to 1. For theoretical cases where cross-CU references are present but the CUs do not share an abbrev table, the size of CloneUnitCtxMap will grow as each CU is processed. --- bolt/lib/Core/DIEBuilder.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp index caa5ecbea521d..762d3419edd34 100644 --- a/bolt/lib/Core/DIEBuilder.cpp +++ b/bolt/lib/Core/DIEBuilder.cpp @@ -266,13 +266,11 @@ void DIEBuilder::buildCompileUnits(const bool Init) { } void DIEBuilder::buildCompileUnits(const std::vector<DWARFUnit *> &CUs) { BuilderState.reset(new State()); - // Initializing to full size because there could be cross CU references with - // different abbrev offsets. LLVM happens to output CUs that have cross CU - // references with the same abbrev table. So destinations end up in the first - // set, even if they themselves don't have src cross cu ref. We could have - // cases where this is not the case. In which case this container needs to be - // big enough for all. - getState().CloneUnitCtxMap.resize(DwarfContext->getNumCompileUnits()); + // Allocating enough for current batch being processed. + // In real use cases we either processing a batch of CUs with no cross + // references, or if they do have them it is due to LTO. With clang they will + // share the same abbrev table. In either case this vector will not grow. + getState().CloneUnitCtxMap.resize(CUs.size()); getState().Type = ProcessingType::CUs; for (DWARFUnit *CU : CUs) registerUnit(*CU, false); @@ -897,6 +895,10 @@ void DIEBuilder::registerUnit(DWARFUnit &DU, bool NeedSort) { }); } getState().UnitIDMap[getHash(DU)] = getState().DUList.size(); + // This handles the case where we do have cross cu references, but CUs do not + // share the same abbrev table.
+ if (getState().DUList.size() == getState().CloneUnitCtxMap.size()) + getState().CloneUnitCtxMap.emplace_back(); getState().DUList.push_back(&DU); } From 0ea87560cca4993b89aecf6cd3d93d6d97f2b3fb Mon Sep 17 00:00:00 2001 From: Max Winkler <82551778+MaxEW707@users.noreply.github.com> Date: Wed, 20 Dec 2023 19:43:46 -0500 Subject: [PATCH 013/342] [clang] Separate Intel ADC instrinsics from ADX intrinsics (#75992) See https://github.com/llvm/llvm-project/pull/75711 for discussion. As a summary from the PR above, `<adxintrin.h>` includes adc intrinsics and adx intrinsics. To support MSVC STL we need to expose the adc intrinsics inside the currently proposed `<intrin0.h>` header. Move the processor agnostic adc intrinsics into a separate file that can be included from `<immintrin.h>` and the currently proposed `<intrin0.h>`. --- clang/lib/Headers/CMakeLists.txt | 1 + clang/lib/Headers/adcintrin.h | 160 +++++++++++++++++++++++++++++++ clang/lib/Headers/adxintrin.h | 143 ++------------------------- clang/lib/Headers/immintrin.h | 8 +- 4 files changed, 176 insertions(+), 136 deletions(-) create mode 100644 clang/lib/Headers/adcintrin.h diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index f8fdd402777e4..735e4e4e3be89 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -139,6 +139,7 @@ set(webassembly_files set(x86_files # Intrinsics + adcintrin.h adxintrin.h ammintrin.h amxcomplexintrin.h diff --git a/clang/lib/Headers/adcintrin.h b/clang/lib/Headers/adcintrin.h new file mode 100644 index 0000000000000..0065a1b543f81 --- /dev/null +++ b/clang/lib/Headers/adcintrin.h @@ -0,0 +1,160 @@ +/*===---- adcintrin.h - ADC intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __ADCINTRIN_H +#define __ADCINTRIN_H + +#if !defined(__i386__) && !defined(__x86_64__) +#error "This header is only meant to be used on x86 and x64 architecture" +#endif + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) + +/* Use C++ inline semantics in C++, GNU inline for C mode. */ +#if defined(__cplusplus) +#define __INLINE __inline +#else +#define __INLINE static __inline +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated +/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory +/// at \a __p, and returns the 8-bit carry-out (carry flag). +/// +/// \code{.operation} +/// temp := (__cf == 0) ? 0 : 1 +/// Store32(__p, __x + __y + temp) +/// result := CF +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the \c ADC instruction. +/// +/// \param __cf +/// The 8-bit unsigned carry flag; any non-zero value indicates carry. +/// \param __x +/// A 32-bit unsigned addend. +/// \param __y +/// A 32-bit unsigned addend. +/// \param __p +/// Pointer to memory for storing the sum. +/// \returns The 8-bit unsigned carry-out value. 
+__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf, + unsigned int __x, + unsigned int __y, + unsigned int *__p) { + return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p); +} + +/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry +/// flag \a __cf, and subtracts the result from unsigned 32-bit integer +/// \a __x. Stores the unsigned 32-bit difference in the memory at \a __p, +/// and returns the 8-bit carry-out (carry or overflow flag). +/// +/// \code{.operation} +/// temp := (__cf == 0) ? 0 : 1 +/// Store32(__p, __x - (__y + temp)) +/// result := CF +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the \c SBB instruction. +/// +/// \param __cf +/// The 8-bit unsigned carry flag; any non-zero value indicates carry. +/// \param __x +/// The 32-bit unsigned minuend. +/// \param __y +/// The 32-bit unsigned subtrahend. +/// \param __p +/// Pointer to memory for storing the difference. +/// \returns The 8-bit unsigned carry-out value. +__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf, + unsigned int __x, + unsigned int __y, + unsigned int *__p) { + return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p); +} + +#ifdef __x86_64__ +/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated +/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory +/// at \a __p, and returns the 8-bit carry-out (carry flag). +/// +/// \code{.operation} +/// temp := (__cf == 0) ? 0 : 1 +/// Store64(__p, __x + __y + temp) +/// result := CF +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the \c ADC instruction. +/// +/// \param __cf +/// The 8-bit unsigned carry flag; any non-zero value indicates carry. +/// \param __x +/// A 64-bit unsigned addend. +/// \param __y +/// A 64-bit unsigned addend. +/// \param __p +/// Pointer to memory for storing the sum. +/// \returns The 8-bit unsigned carry-out value. +__INLINE unsigned char __DEFAULT_FN_ATTRS +_addcarry_u64(unsigned char __cf, unsigned long long __x, + unsigned long long __y, unsigned long long *__p) { + return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p); +} + +/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry +/// flag \a __cf, and subtracts the result from unsigned 64-bit integer +/// \a __x. Stores the unsigned 64-bit difference in the memory at \a __p, +/// and returns the 8-bit carry-out (carry or overflow flag). +/// +/// \code{.operation} +/// temp := (__cf == 0) ? 0 : 1 +/// Store64(__p, __x - (__y + temp)) +/// result := CF +/// \endcode +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the \c ADC instruction. +/// +/// \param __cf +/// The 8-bit unsigned carry flag; any non-zero value indicates carry. +/// \param __x +/// The 64-bit unsigned minuend. +/// \param __y +/// The 64-bit unsigned subtrahend. +/// \param __p +/// Pointer to memory for storing the difference. +/// \returns The 8-bit unsigned carry-out value. 
+__INLINE unsigned char __DEFAULT_FN_ATTRS +_subborrow_u64(unsigned char __cf, unsigned long long __x, + unsigned long long __y, unsigned long long *__p) { + return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p); +} +#endif + +#if defined(__cplusplus) +} +#endif + +#undef __INLINE +#undef __DEFAULT_FN_ATTRS + +#endif /* __ADCINTRIN_H */ diff --git a/clang/lib/Headers/adxintrin.h b/clang/lib/Headers/adxintrin.h index 20f6211e567b3..bc6a4caf35337 100644 --- a/clang/lib/Headers/adxintrin.h +++ b/clang/lib/Headers/adxintrin.h @@ -15,7 +15,8 @@ #define __ADXINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("adx"))) /* Use C++ inline semantics in C++, GNU inline for C mode. */ #if defined(__cplusplus) @@ -53,10 +54,10 @@ extern "C" { /// \param __p /// Pointer to memory for storing the sum. /// \returns The 8-bit unsigned carry-out value. -__INLINE unsigned char - __attribute__((__always_inline__, __nodebug__, __target__("adx"))) - _addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y, - unsigned int *__p) { +__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarryx_u32(unsigned char __cf, + unsigned int __x, + unsigned int __y, + unsigned int *__p) { return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p); } @@ -84,137 +85,10 @@ __INLINE unsigned char /// \param __p /// Pointer to memory for storing the sum. /// \returns The 8-bit unsigned carry-out value. -__INLINE unsigned char - __attribute__((__always_inline__, __nodebug__, __target__("adx"))) - _addcarryx_u64(unsigned char __cf, unsigned long long __x, - unsigned long long __y, unsigned long long *__p) { - return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p); -} -#endif - -/* Intrinsics that are also available if __ADX__ is undefined. */ - -/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated -/// by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory -/// at \a __p, and returns the 8-bit carry-out (carry flag). -/// -/// \code{.operation} -/// temp := (__cf == 0) ? 0 : 1 -/// Store32(__p, __x + __y + temp) -/// result := CF -/// \endcode -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the \c ADC instruction. -/// -/// \param __cf -/// The 8-bit unsigned carry flag; any non-zero value indicates carry. -/// \param __x -/// A 32-bit unsigned addend. -/// \param __y -/// A 32-bit unsigned addend. -/// \param __p -/// Pointer to memory for storing the sum. -/// \returns The 8-bit unsigned carry-out value. -__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf, - unsigned int __x, - unsigned int __y, - unsigned int *__p) { - return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p); -} - -#ifdef __x86_64__ -/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated -/// by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory -/// at \a __p, and returns the 8-bit carry-out (carry flag). -/// -/// \code{.operation} -/// temp := (__cf == 0) ? 0 : 1 -/// Store64(__p, __x + __y + temp) -/// result := CF -/// \endcode -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the \c ADC instruction. -/// -/// \param __cf -/// The 8-bit unsigned carry flag; any non-zero value indicates carry. -/// \param __x -/// A 64-bit unsigned addend. -/// \param __y -/// A 64-bit unsigned addend. 
-/// \param __p -/// Pointer to memory for storing the sum. -/// \returns The 8-bit unsigned carry-out value. __INLINE unsigned char __DEFAULT_FN_ATTRS -_addcarry_u64(unsigned char __cf, unsigned long long __x, - unsigned long long __y, unsigned long long *__p) { - return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p); -} -#endif - -/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry -/// flag \a __cf, and subtracts the result from unsigned 32-bit integer -/// \a __x. Stores the unsigned 32-bit difference in the memory at \a __p, -/// and returns the 8-bit carry-out (carry or overflow flag). -/// -/// \code{.operation} -/// temp := (__cf == 0) ? 0 : 1 -/// Store32(__p, __x - (__y + temp)) -/// result := CF -/// \endcode -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the \c SBB instruction. -/// -/// \param __cf -/// The 8-bit unsigned carry flag; any non-zero value indicates carry. -/// \param __x -/// The 32-bit unsigned minuend. -/// \param __y -/// The 32-bit unsigned subtrahend. -/// \param __p -/// Pointer to memory for storing the difference. -/// \returns The 8-bit unsigned carry-out value. -__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf, - unsigned int __x, - unsigned int __y, - unsigned int *__p) { - return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p); -} - -#ifdef __x86_64__ -/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry -/// flag \a __cf, and subtracts the result from unsigned 64-bit integer -/// \a __x. Stores the unsigned 64-bit difference in the memory at \a __p, -/// and returns the 8-bit carry-out (carry or overflow flag). -/// -/// \code{.operation} -/// temp := (__cf == 0) ? 0 : 1 -/// Store64(__p, __x - (__y + temp)) -/// result := CF -/// \endcode -/// -/// \headerfile <immintrin.h> -/// -/// This intrinsic corresponds to the \c ADC instruction. -/// -/// \param __cf -/// The 8-bit unsigned carry flag; any non-zero value indicates carry. -/// \param __x -/// The 64-bit unsigned minuend. -/// \param __y -/// The 64-bit unsigned subtrahend. -/// \param __p -/// Pointer to memory for storing the difference. -/// \returns The 8-bit unsigned carry-out value. -__INLINE unsigned char __DEFAULT_FN_ATTRS -_subborrow_u64(unsigned char __cf, unsigned long long __x, +_addcarryx_u64(unsigned char __cf, unsigned long long __x, unsigned long long __y, unsigned long long *__p) { - return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p); + return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p); } #endif @@ -222,6 +96,7 @@ _subborrow_u64(unsigned char __cf, unsigned long long __x, } #endif +#undef __INLINE #undef __DEFAULT_FN_ATTRS #endif /* __ADXINTRIN_H */ diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index 9bfe2fcdabdb3..0149a1cdea633 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -580,9 +580,13 @@ _storebe_i64(void * __P, long long __D) { #include <cetintrin.h> #endif -/* Some intrinsics inside adxintrin.h are available only on processors with ADX, - * whereas others are also available at all times. */ +/* Intrinsics inside adcintrin.h are available at all times. 
*/ +#include <adcintrin.h> + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__ADX__) #include <adxintrin.h> +#endif #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__RDSEED__) From ba192debb409a0ad513772eed289099321e531df Mon Sep 17 00:00:00 2001 From: Joseph Huber <huberjn@outlook.com> Date: Wed, 20 Dec 2023 19:02:50 -0600 Subject: [PATCH 014/342] [Libomptarget][Obvious] Fix typo in attribute lookup Summary: These are keys into the AMDGPU target metadata. One of them had a typo which prevented it from being extracted. --- openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h index 1cb99c0a5dca3..2471590c27b37 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -202,7 +202,7 @@ class KernelInfoReader { KernelData.VGPRSpillCount = V.second.getUInt(); } else if (IsKey(V.first, ".private_segment_fixed_size")) { KernelData.PrivateSegmentSize = V.second.getUInt(); - } else if (IsKey(V.first, ".group_segement_fixed_size")) { + } else if (IsKey(V.first, ".group_segment_fixed_size")) { KernelData.GroupSegmentList = V.second.getUInt(); } else if (IsKey(V.first, ".reqd_workgroup_size")) { GetSequenceOfThreeInts(V.second, KernelData.RequestedWorkgroupSize); From bffdde8b8e5d9a76a47949cd0f574f3ce656e181 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang <hanhan0912@gmail.com> Date: Wed, 20 Dec 2023 17:03:55 -0800 Subject: [PATCH 015/342] [mlir][tensor][NFC] Fix a typo in pack simplification pattern. (#76109) --- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 1b0cdbd0f4f73..7c35dd4d95361 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -3469,7 +3469,7 @@ OpFoldResult SplatOp::fold(FoldAdaptor adaptor) { namespace { /// Packing one-dimensional tensor can be expressed as an expand shape op. -struct SimplifyPackToExandShape : public OpRewritePattern<PackOp> { +struct SimplifyPackToExpandShape : public OpRewritePattern<PackOp> { using OpRewritePattern<PackOp>::OpRewritePattern; Value insertExpand(RewriterBase &rewriter, Location loc, Value operand, @@ -3501,7 +3501,7 @@ struct SimplifyPackToExandShape : public OpRewritePattern<PackOp> { } // namespace void mlir::tensor::populateSimplifyTensorPack(RewritePatternSet &patterns) { - patterns.add<SimplifyPackToExandShape>(patterns.getContext()); + patterns.add<SimplifyPackToExpandShape>(patterns.getContext()); } template <typename OpTy> From 1638657dce0ca03c7d5cd9dfc23bf31485b4ac43 Mon Sep 17 00:00:00 2001 From: Konstantin Varlamov <varconsteq@gmail.com> Date: Wed, 20 Dec 2023 17:24:48 -0800 Subject: [PATCH 016/342] [libc++][hardening] Categorize more 'valid-element-access' checks. 
(#71620) --- libcxx/include/__algorithm/ranges_max.h | 5 +- libcxx/include/__algorithm/ranges_min.h | 5 +- libcxx/include/__algorithm/ranges_minmax.h | 5 +- libcxx/include/__algorithm/sample.h | 2 +- libcxx/include/__format/formatter_output.h | 6 +- .../include/__format/parser_std_format_spec.h | 6 +- libcxx/include/__iterator/common_iterator.h | 32 +++++----- libcxx/include/__iterator/counted_iterator.h | 11 ++-- libcxx/include/__ranges/subrange.h | 4 +- libcxx/include/__ranges/view_interface.h | 10 ++-- .../include/__utility/is_pointer_in_range.h | 2 +- libcxx/include/experimental/__simd/vec_ext.h | 4 +- libcxx/src/support/ibm/xlocale_zos.cpp | 2 +- .../alg.sorting/assert.min.max.pass.cpp | 2 +- .../counted.iterator/assert.pass.cpp | 42 ++++++++++++++ .../iterators.common/assert.pass.cpp | 58 +++++++++++++++++++ 16 files changed, 151 insertions(+), 45 deletions(-) create mode 100644 libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp create mode 100644 libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp diff --git a/libcxx/include/__algorithm/ranges_max.h b/libcxx/include/__algorithm/ranges_max.h index 782ce2670f055..0f89cb2ff5bf2 100644 --- a/libcxx/include/__algorithm/ranges_max.h +++ b/libcxx/include/__algorithm/ranges_max.h @@ -54,7 +54,8 @@ struct __fn { indirect_strict_weak_order<projected<const _Tp*, _Proj>> _Comp = ranges::less> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr _Tp operator()(initializer_list<_Tp> __il, _Comp __comp = {}, _Proj __proj = {}) const { - _LIBCPP_ASSERT_UNCATEGORIZED(__il.begin() != __il.end(), "initializer_list must contain at least one element"); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + __il.begin() != __il.end(), "initializer_list must contain at least one element"); auto __comp_lhs_rhs_swapped = [&](auto&& __lhs, auto&& __rhs) -> bool { return std::invoke(__comp, __rhs, __lhs); }; return *ranges::__min_element_impl(__il.begin(), __il.end(), __comp_lhs_rhs_swapped, __proj); @@ -69,7 +70,7 @@ struct __fn { auto __first = ranges::begin(__r); auto __last = ranges::end(__r); - _LIBCPP_ASSERT_UNCATEGORIZED(__first != __last, "range must contain at least one element"); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__first != __last, "range must contain at least one element"); if constexpr (forward_range<_Rp> && !__is_cheap_to_copy<range_value_t<_Rp>>) { auto __comp_lhs_rhs_swapped = [&](auto&& __lhs, auto&& __rhs) -> bool { diff --git a/libcxx/include/__algorithm/ranges_min.h b/libcxx/include/__algorithm/ranges_min.h index be15b4536734d..8757358cdf37d 100644 --- a/libcxx/include/__algorithm/ranges_min.h +++ b/libcxx/include/__algorithm/ranges_min.h @@ -53,7 +53,8 @@ struct __fn { indirect_strict_weak_order<projected<const _Tp*, _Proj>> _Comp = ranges::less> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr _Tp operator()(initializer_list<_Tp> __il, _Comp __comp = {}, _Proj __proj = {}) const { - _LIBCPP_ASSERT_UNCATEGORIZED(__il.begin() != __il.end(), "initializer_list must contain at least one element"); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + __il.begin() != __il.end(), "initializer_list must contain at least one element"); return *ranges::__min_element_impl(__il.begin(), __il.end(), __comp, __proj); } @@ -65,7 +66,7 @@ struct __fn { operator()(_Rp&& __r, _Comp __comp = {}, _Proj __proj = {}) const { auto __first = ranges::begin(__r); auto __last = ranges::end(__r); - _LIBCPP_ASSERT_UNCATEGORIZED(__first != __last, "range must contain at least one element"); + 
_LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__first != __last, "range must contain at least one element"); if constexpr (forward_range<_Rp> && !__is_cheap_to_copy<range_value_t<_Rp>>) { return *ranges::__min_element_impl(__first, __last, __comp, __proj); } else { diff --git a/libcxx/include/__algorithm/ranges_minmax.h b/libcxx/include/__algorithm/ranges_minmax.h index a5b5cf9bd0ab9..22a62b620c936 100644 --- a/libcxx/include/__algorithm/ranges_minmax.h +++ b/libcxx/include/__algorithm/ranges_minmax.h @@ -65,7 +65,8 @@ struct __fn { indirect_strict_weak_order<projected<const _Type*, _Proj>> _Comp = ranges::less> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr ranges::minmax_result<_Type> operator()(initializer_list<_Type> __il, _Comp __comp = {}, _Proj __proj = {}) const { - _LIBCPP_ASSERT_UNCATEGORIZED(__il.begin() != __il.end(), "initializer_list has to contain at least one element"); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + __il.begin() != __il.end(), "initializer_list has to contain at least one element"); auto __iters = std::__minmax_element_impl(__il.begin(), __il.end(), __comp, __proj); return ranges::minmax_result<_Type>{*__iters.first, *__iters.second}; } @@ -80,7 +81,7 @@ struct __fn { auto __last = ranges::end(__r); using _ValueT = range_value_t<_Range>; - _LIBCPP_ASSERT_UNCATEGORIZED(__first != __last, "range has to contain at least one element"); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__first != __last, "range has to contain at least one element"); if constexpr (forward_range<_Range>) { // Special-case the one element case. Avoid repeatedly initializing objects from the result of an iterator diff --git a/libcxx/include/__algorithm/sample.h b/libcxx/include/__algorithm/sample.h index cc29dd686f6be..ebe5180b7eeca 100644 --- a/libcxx/include/__algorithm/sample.h +++ b/libcxx/include/__algorithm/sample.h @@ -89,7 +89,7 @@ _LIBCPP_HIDE_FROM_ABI _SampleIterator __sample( _SampleIterator __output_iter, _Distance __n, _UniformRandomNumberGenerator& __g) { - _LIBCPP_ASSERT_UNCATEGORIZED(__n >= 0, "N must be a positive number."); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n >= 0, "N must be a positive number."); using _PopIterCategory = typename _IterOps<_AlgPolicy>::template __iterator_category<_PopulationIterator>; using _Difference = typename _IterOps<_AlgPolicy>::template __difference_type<_PopulationIterator>; diff --git a/libcxx/include/__format/formatter_output.h b/libcxx/include/__format/formatter_output.h index 89854f67f5fc1..31e06425703ae 100644 --- a/libcxx/include/__format/formatter_output.h +++ b/libcxx/include/__format/formatter_output.h @@ -246,7 +246,7 @@ __write(_Iterator __first, output_iterator<const iter_value_t<_Iterator>&> auto __out_it, __format_spec::__parsed_specifications<_ParserCharT> __specs, ptrdiff_t __size) -> decltype(__out_it) { - _LIBCPP_ASSERT_UNCATEGORIZED(__first <= __last, "Not a valid range"); + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__first <= __last, "Not a valid range"); return __formatter::__write(basic_string_view{__first, __last}, std::move(__out_it), __specs, __size); } @@ -259,7 +259,7 @@ __write(_Iterator __first, _Iterator __last, output_iterator<const iter_value_t<_Iterator>&> auto __out_it, __format_spec::__parsed_specifications<_ParserCharT> __specs) -> decltype(__out_it) { - _LIBCPP_ASSERT_UNCATEGORIZED(__first <= __last, "Not a valid range"); + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__first <= __last, "Not a valid range"); return __formatter::__write(__first, __last, std::move(__out_it), __specs, __last - __first); } @@ -273,7 +273,7 @@ 
_LIBCPP_HIDE_FROM_ABI auto __write_transformed( output_iterator<const _CharT&> auto __out_it, __format_spec::__parsed_specifications<_ParserCharT> __specs, _UnaryOperation __op) -> decltype(__out_it) { - _LIBCPP_ASSERT_UNCATEGORIZED(__first <= __last, "Not a valid range"); + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__first <= __last, "Not a valid range"); ptrdiff_t __size = __last - __first; if (__size >= __specs.__width_) diff --git a/libcxx/include/__format/parser_std_format_spec.h b/libcxx/include/__format/parser_std_format_spec.h index 9a91179fdfb52..e38729db965c3 100644 --- a/libcxx/include/__format/parser_std_format_spec.h +++ b/libcxx/include/__format/parser_std_format_spec.h @@ -591,7 +591,7 @@ class _LIBCPP_TEMPLATE_VIS __parser { || (same_as<_CharT, wchar_t> && sizeof(wchar_t) == 2) # endif _LIBCPP_HIDE_FROM_ABI constexpr bool __parse_fill_align(_Iterator& __begin, _Iterator __end, bool __use_range_fill) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __begin != __end, "when called with an empty input the function will cause " "undefined behavior by evaluating data not in the input"); @@ -624,7 +624,7 @@ class _LIBCPP_TEMPLATE_VIS __parser { template <contiguous_iterator _Iterator> requires(same_as<_CharT, wchar_t> && sizeof(wchar_t) == 4) _LIBCPP_HIDE_FROM_ABI constexpr bool __parse_fill_align(_Iterator& __begin, _Iterator __end, bool __use_range_fill) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __begin != __end, "when called with an empty input the function will cause " "undefined behavior by evaluating data not in the input"); @@ -652,7 +652,7 @@ class _LIBCPP_TEMPLATE_VIS __parser { // range-fill and tuple-fill are identical template <contiguous_iterator _Iterator> _LIBCPP_HIDE_FROM_ABI constexpr bool __parse_fill_align(_Iterator& __begin, _Iterator __end, bool __use_range_fill) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( __begin != __end, "when called with an empty input the function will cause " "undefined behavior by evaluating data not in the input"); diff --git a/libcxx/include/__iterator/common_iterator.h b/libcxx/include/__iterator/common_iterator.h index cc49d62cd04dd..7b3f4610d5319 100644 --- a/libcxx/include/__iterator/common_iterator.h +++ b/libcxx/include/__iterator/common_iterator.h @@ -77,7 +77,7 @@ class common_iterator { requires convertible_to<const _I2&, _Iter> && convertible_to<const _S2&, _Sent> _LIBCPP_HIDE_FROM_ABI constexpr common_iterator(const common_iterator<_I2, _S2>& __other) : __hold_([&]() -> variant<_Iter, _Sent> { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( !__other.__hold_.valueless_by_exception(), "Attempted to construct from a valueless common_iterator"); if (__other.__hold_.index() == 0) return variant<_Iter, _Sent>{in_place_index<0>, std::__unchecked_get<0>(__other.__hold_)}; @@ -88,7 +88,7 @@ class common_iterator { requires convertible_to<const _I2&, _Iter> && convertible_to<const _S2&, _Sent> && assignable_from<_Iter&, const _I2&> && assignable_from<_Sent&, const _S2&> _LIBCPP_HIDE_FROM_ABI common_iterator& operator=(const common_iterator<_I2, _S2>& __other) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( !__other.__hold_.valueless_by_exception(), "Attempted to assign from a valueless common_iterator"); auto __idx = __hold_.index(); @@ -110,7 +110,7 @@ class common_iterator { } _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator*() { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( 
std::holds_alternative<_Iter>(__hold_), "Attempted to dereference a non-dereferenceable common_iterator"); return *std::__unchecked_get<_Iter>(__hold_); } @@ -118,7 +118,7 @@ class common_iterator { _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator*() const requires __dereferenceable<const _Iter> { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( std::holds_alternative<_Iter>(__hold_), "Attempted to dereference a non-dereferenceable common_iterator"); return *std::__unchecked_get<_Iter>(__hold_); } @@ -129,7 +129,7 @@ class common_iterator { __i.operator->(); } || is_reference_v<iter_reference_t<_I2>> || constructible_from<iter_value_t<_I2>, iter_reference_t<_I2>>) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( std::holds_alternative<_Iter>(__hold_), "Attempted to dereference a non-dereferenceable common_iterator"); if constexpr (is_pointer_v<_Iter> || requires(const _Iter& __i) { __i.operator->(); }) { return std::__unchecked_get<_Iter>(__hold_); @@ -142,14 +142,14 @@ class common_iterator { } _LIBCPP_HIDE_FROM_ABI common_iterator& operator++() { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( std::holds_alternative<_Iter>(__hold_), "Attempted to increment a non-dereferenceable common_iterator"); ++std::__unchecked_get<_Iter>(__hold_); return *this; } _LIBCPP_HIDE_FROM_ABI decltype(auto) operator++(int) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( std::holds_alternative<_Iter>(__hold_), "Attempted to increment a non-dereferenceable common_iterator"); if constexpr (forward_iterator<_Iter>) { auto __tmp = *this; @@ -170,9 +170,9 @@ class common_iterator { requires sentinel_for<_Sent, _I2> _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const common_iterator& __x, const common_iterator<_I2, _S2>& __y) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( !__x.__hold_.valueless_by_exception(), "Attempted to compare a valueless common_iterator"); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( !__y.__hold_.valueless_by_exception(), "Attempted to compare a valueless common_iterator"); auto __x_index = __x.__hold_.index(); @@ -191,9 +191,9 @@ class common_iterator { requires sentinel_for<_Sent, _I2> && equality_comparable_with<_Iter, _I2> _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const common_iterator& __x, const common_iterator<_I2, _S2>& __y) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( !__x.__hold_.valueless_by_exception(), "Attempted to compare a valueless common_iterator"); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( !__y.__hold_.valueless_by_exception(), "Attempted to compare a valueless common_iterator"); auto __x_index = __x.__hold_.index(); @@ -215,9 +215,9 @@ class common_iterator { requires sized_sentinel_for<_Sent, _I2> _LIBCPP_HIDE_FROM_ABI friend constexpr iter_difference_t<_I2> operator-(const common_iterator& __x, const common_iterator<_I2, _S2>& __y) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( !__x.__hold_.valueless_by_exception(), "Attempted to subtract from a valueless common_iterator"); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( !__y.__hold_.valueless_by_exception(), "Attempted to subtract a valueless common_iterator"); auto __x_index = __x.__hold_.index(); @@ -239,7 +239,7 @@ class common_iterator { iter_move(const common_iterator& __i) noexcept(noexcept(ranges::iter_move(std::declval<const 
_Iter&>()))) requires input_iterator<_Iter> { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( std::holds_alternative<_Iter>(__i.__hold_), "Attempted to iter_move a non-dereferenceable common_iterator"); return ranges::iter_move(std::__unchecked_get<_Iter>(__i.__hold_)); } @@ -248,9 +248,9 @@ class common_iterator { _LIBCPP_HIDE_FROM_ABI friend constexpr void iter_swap(const common_iterator& __x, const common_iterator<_I2, _S2>& __y) noexcept( noexcept(ranges::iter_swap(std::declval<const _Iter&>(), std::declval<const _I2&>()))) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( std::holds_alternative<_Iter>(__x.__hold_), "Attempted to iter_swap a non-dereferenceable common_iterator"); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( std::holds_alternative<_I2>(__y.__hold_), "Attempted to iter_swap a non-dereferenceable common_iterator"); return ranges::iter_swap(std::__unchecked_get<_Iter>(__x.__hold_), std::__unchecked_get<_I2>(__y.__hold_)); } diff --git a/libcxx/include/__iterator/counted_iterator.h b/libcxx/include/__iterator/counted_iterator.h index c72ac677ff2f8..008c52fa87ce0 100644 --- a/libcxx/include/__iterator/counted_iterator.h +++ b/libcxx/include/__iterator/counted_iterator.h @@ -105,14 +105,14 @@ class counted_iterator _LIBCPP_HIDE_FROM_ABI constexpr iter_difference_t<_Iter> count() const noexcept { return __count_; } _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator*() { - _LIBCPP_ASSERT_UNCATEGORIZED(__count_ > 0, "Iterator is equal to or past end."); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__count_ > 0, "Iterator is equal to or past end."); return *__current_; } _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator*() const requires __dereferenceable<const _Iter> { - _LIBCPP_ASSERT_UNCATEGORIZED(__count_ > 0, "Iterator is equal to or past end."); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__count_ > 0, "Iterator is equal to or past end."); return *__current_; } @@ -229,7 +229,7 @@ class counted_iterator _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) operator[](iter_difference_t<_Iter> __n) const requires random_access_iterator<_Iter> { - _LIBCPP_ASSERT_UNCATEGORIZED(__n < __count_, "Subscript argument must be less than size."); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__n < __count_, "Subscript argument must be less than size."); return __current_[__n]; } @@ -253,7 +253,7 @@ class counted_iterator iter_move(const counted_iterator& __i) noexcept(noexcept(ranges::iter_move(__i.__current_))) requires input_iterator<_Iter> { - _LIBCPP_ASSERT_UNCATEGORIZED(__i.__count_ > 0, "Iterator must not be past end of range."); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__i.__count_ > 0, "Iterator must not be past end of range."); return ranges::iter_move(__i.__current_); } @@ -261,7 +261,8 @@ class counted_iterator _LIBCPP_HIDE_FROM_ABI friend constexpr void iter_swap(const counted_iterator& __x, const counted_iterator<_I2>& __y) noexcept(noexcept(ranges::iter_swap(__x.__current_, __y.__current_))) { - _LIBCPP_ASSERT_UNCATEGORIZED(__x.__count_ > 0 && __y.__count_ > 0, "Iterators must not be past end of range."); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + __x.__count_ > 0 && __y.__count_ > 0, "Iterators must not be past end of range."); return ranges::iter_swap(__x.__current_, __y.__current_); } diff --git a/libcxx/include/__ranges/subrange.h b/libcxx/include/__ranges/subrange.h index a7a3c1efa70e5..a41978275b787 100644 --- a/libcxx/include/__ranges/subrange.h +++ b/libcxx/include/__ranges/subrange.h @@ -101,8 +101,8 @@ class 
_LIBCPP_TEMPLATE_VIS subrange : public view_interface<subrange<_Iter, _Sen requires(_Kind == subrange_kind::sized) : __begin_(std::move(__iter)), __end_(std::move(__sent)), __size_(__n) { if constexpr (sized_sentinel_for<_Sent, _Iter>) - _LIBCPP_ASSERT_UNCATEGORIZED((__end_ - __begin_) == static_cast<iter_difference_t<_Iter>>(__n), - "std::ranges::subrange was passed an invalid size hint"); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS((__end_ - __begin_) == static_cast<iter_difference_t<_Iter>>(__n), + "std::ranges::subrange was passed an invalid size hint"); } template <__different_from<subrange> _Range> diff --git a/libcxx/include/__ranges/view_interface.h b/libcxx/include/__ranges/view_interface.h index 3216e0bd6ff20..84dd1c316de37 100644 --- a/libcxx/include/__ranges/view_interface.h +++ b/libcxx/include/__ranges/view_interface.h @@ -109,7 +109,7 @@ class view_interface { _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) front() requires forward_range<_D2> { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( !empty(), "Precondition `!empty()` not satisfied. `.front()` called on an empty view."); return *ranges::begin(__derived()); } @@ -118,7 +118,7 @@ class view_interface { _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) front() const requires forward_range<const _D2> { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( !empty(), "Precondition `!empty()` not satisfied. `.front()` called on an empty view."); return *ranges::begin(__derived()); } @@ -127,7 +127,8 @@ class view_interface { _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) back() requires bidirectional_range<_D2> && common_range<_D2> { - _LIBCPP_ASSERT_UNCATEGORIZED(!empty(), "Precondition `!empty()` not satisfied. `.back()` called on an empty view."); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + !empty(), "Precondition `!empty()` not satisfied. `.back()` called on an empty view."); return *ranges::prev(ranges::end(__derived())); } @@ -135,7 +136,8 @@ class view_interface { _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) back() const requires bidirectional_range<const _D2> && common_range<const _D2> { - _LIBCPP_ASSERT_UNCATEGORIZED(!empty(), "Precondition `!empty()` not satisfied. `.back()` called on an empty view."); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + !empty(), "Precondition `!empty()` not satisfied. `.back()` called on an empty view."); return *ranges::prev(ranges::end(__derived())); } diff --git a/libcxx/include/__utility/is_pointer_in_range.h b/libcxx/include/__utility/is_pointer_in_range.h index e859562e7457a..68cdfea6f9452 100644 --- a/libcxx/include/__utility/is_pointer_in_range.h +++ b/libcxx/include/__utility/is_pointer_in_range.h @@ -35,7 +35,7 @@ template <class _Tp, class _Up, __enable_if_t<__is_less_than_comparable<const _T _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _LIBCPP_NO_SANITIZE("address") bool __is_pointer_in_range( const _Tp* __begin, const _Tp* __end, const _Up* __ptr) { if (__libcpp_is_constant_evaluated()) { - _LIBCPP_ASSERT_UNCATEGORIZED(__builtin_constant_p(__begin <= __end), "__begin and __end do not form a range"); + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__builtin_constant_p(__begin <= __end), "__begin and __end do not form a range"); // If this is not a constant during constant evaluation we know that __ptr is not part of the allocation where // [__begin, __end) is. 
diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h index baaeda6a7401a..56a0b888104bf 100644 --- a/libcxx/include/experimental/__simd/vec_ext.h +++ b/libcxx/include/experimental/__simd/vec_ext.h @@ -38,11 +38,11 @@ struct __simd_storage<_Tp, simd_abi::__vec_ext<_Np>> { _Tp __data __attribute__((__vector_size__(std::__bit_ceil((sizeof(_Tp) * _Np))))); _LIBCPP_HIDE_FROM_ABI _Tp __get(size_t __idx) const noexcept { - _LIBCPP_ASSERT_UNCATEGORIZED(__idx >= 0 && __idx < _Np, "Index is out of bounds"); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__idx >= 0 && __idx < _Np, "Index is out of bounds"); return __data[__idx]; } _LIBCPP_HIDE_FROM_ABI void __set(size_t __idx, _Tp __v) noexcept { - _LIBCPP_ASSERT_UNCATEGORIZED(__idx >= 0 && __idx < _Np, "Index is out of bounds"); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__idx >= 0 && __idx < _Np, "Index is out of bounds"); __data[__idx] = __v; } }; diff --git a/libcxx/src/support/ibm/xlocale_zos.cpp b/libcxx/src/support/ibm/xlocale_zos.cpp index 4c20997b4eb79..9a90e08e11cf9 100644 --- a/libcxx/src/support/ibm/xlocale_zos.cpp +++ b/libcxx/src/support/ibm/xlocale_zos.cpp @@ -103,7 +103,7 @@ locale_t uselocale(locale_t newloc) { tokenized.push_back(s); } - _LIBCPP_ASSERT_UNCATEGORIZED(tokenized.size() >= _NCAT, "locale-name list is too short"); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(tokenized.size() >= _NCAT, "locale-name list is too short"); previous_loc->lc_collate = tokenized[LC_COLLATE]; previous_loc->lc_ctype = tokenized[LC_CTYPE]; diff --git a/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp b/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp index b23b4d4530eec..bd9dfd4549c4e 100644 --- a/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp +++ b/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp @@ -10,7 +10,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17 -// REQUIRES: libcpp-hardening-mode={{extensive|debug}} +// UNSUPPORTED: libcpp-hardening-mode=none // XFAIL: availability-verbose_abort-missing #include <algorithm> diff --git a/libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp new file mode 100644 index 0000000000000..f803b2cad75be --- /dev/null +++ b/libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp @@ -0,0 +1,42 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: has-unix-headers +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-hardening-mode=none +// XFAIL: availability-verbose_abort-missing + +#include <iterator> + +#include "check_assertion.h" +#include "test_iterators.h" + +int main(int, char**) { + using Iter = std::counted_iterator<int*>; + int a[] = {1, 2, 3}; + Iter valid_i(a, 1); + + { + Iter i; + + TEST_LIBCPP_ASSERT_FAILURE(*i, "Iterator is equal to or past end."); + TEST_LIBCPP_ASSERT_FAILURE(i[999], "Subscript argument must be less than size."); + TEST_LIBCPP_ASSERT_FAILURE(std::ranges::iter_move(i), "Iterator must not be past end of range."); + TEST_LIBCPP_ASSERT_FAILURE(std::ranges::iter_swap(i, valid_i), "Iterators must not be past end of range."); + TEST_LIBCPP_ASSERT_FAILURE(std::ranges::iter_swap(valid_i, i), "Iterators must not be past end of range."); + std::ranges::iter_swap(valid_i, valid_i); // Ok + } + + { // Check the `const` overload of `operator*`. + const Iter i; + + TEST_LIBCPP_ASSERT_FAILURE(*i, "Iterator is equal to or past end."); + } + + return 0; +} diff --git a/libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp new file mode 100644 index 0000000000000..ea4574fc1a9cc --- /dev/null +++ b/libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: has-unix-headers +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: libcpp-hardening-mode=none +// XFAIL: availability-verbose_abort-missing + +#include <iterator> + +#include "check_assertion.h" +#include "test_iterators.h" + +int main(int, char**) { + using Iter = std::common_iterator<int*, sentinel_wrapper<int*>>; + int a[] = {1, 2, 3}; + sentinel_wrapper<int*> s; + Iter valid_i = a; + + { + Iter i = s; + + TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable common_iterator"); + + TEST_LIBCPP_ASSERT_FAILURE(++i, "Attempted to increment a non-dereferenceable common_iterator"); + TEST_LIBCPP_ASSERT_FAILURE(i++, "Attempted to increment a non-dereferenceable common_iterator"); + + TEST_LIBCPP_ASSERT_FAILURE( + std::ranges::iter_move(i), "Attempted to iter_move a non-dereferenceable common_iterator"); + + TEST_LIBCPP_ASSERT_FAILURE( + std::ranges::iter_swap(i, valid_i), "Attempted to iter_swap a non-dereferenceable common_iterator"); + TEST_LIBCPP_ASSERT_FAILURE( + std::ranges::iter_swap(valid_i, i), "Attempted to iter_swap a non-dereferenceable common_iterator"); + std::ranges::iter_swap(valid_i, valid_i); // Ok + } + + { // Check the `const` overload of `operator*`. + const Iter i = s; + TEST_LIBCPP_ASSERT_FAILURE(*i, "Attempted to dereference a non-dereferenceable common_iterator"); + } + + { // Check `operator->`. 
+ struct Foo { + int x = 0; + }; + + std::common_iterator<Foo*, sentinel_wrapper<Foo*>> i = sentinel_wrapper<Foo*>(); + TEST_LIBCPP_ASSERT_FAILURE(i->x, "Attempted to dereference a non-dereferenceable common_iterator"); + } + + return 0; +} From 9e012c76fb45defe733ba6027978919e3ae878c0 Mon Sep 17 00:00:00 2001 From: kda <kda@users.noreply.github.com> Date: Wed, 20 Dec 2023 17:58:03 -0800 Subject: [PATCH 017/342] [sanitizer] Add graceful handling of exceeding StackStore limit. (#76115) --- .../lib/sanitizer_common/sanitizer_stack_store.cpp | 9 +++++++-- .../sanitizer_common/tests/sanitizer_stackdepot_test.cpp | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stack_store.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stack_store.cpp index 148470943b47b..c11df0ddfde43 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stack_store.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stack_store.cpp @@ -44,6 +44,9 @@ StackStore::Id StackStore::Store(const StackTrace &trace, uptr *pack) { uptr idx = 0; *pack = 0; uptr *stack_trace = Alloc(h.size + 1, &idx, pack); + // No more space. + if (stack_trace == nullptr) + return 0; *stack_trace = h.ToUptr(); internal_memcpy(stack_trace + 1, trace.trace, h.size * sizeof(uptr)); *pack += blocks_[GetBlockIdx(idx)].Stored(h.size + 1); @@ -76,8 +79,10 @@ uptr *StackStore::Alloc(uptr count, uptr *idx, uptr *pack) { uptr block_idx = GetBlockIdx(start); uptr last_idx = GetBlockIdx(start + count - 1); if (LIKELY(block_idx == last_idx)) { - // Fits into the a single block. - CHECK_LT(block_idx, ARRAY_SIZE(blocks_)); + // Fits into a single block. + // No more available blocks. Indicate inability to allocate more memory. + if (block_idx >= ARRAY_SIZE(blocks_)) + return nullptr; *idx = start; return blocks_[block_idx].GetOrCreate(this) + GetInBlockIdx(start); } diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp index 3835ce26c4d54..479e4a0c184f7 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stackdepot_test.cpp @@ -148,6 +148,10 @@ static struct StackDepotBenchmarkParams { {500000, 10, 16, true, false}, {1500000, 10, 4, true, true}, {800000, 10, 16, true, true}, + // Go crazy, and create too many unique stacks, such that StackStore runs + // out of space. + {1000000, 1, 128, true, true}, + {100000000, 1, 1, true, true}, }; static std::string PrintStackDepotBenchmarkParams( From 565e5e861f64f455ab789bc50840e0be2d33c417 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu <yedeng.yd@linux.alibaba.com> Date: Thu, 21 Dec 2023 10:30:12 +0800 Subject: [PATCH 018/342] Recommit [NFC] [Serialization] Packing more bits and refactor AbbrevToUse This patch tries to pack more bits into a value to reduce the size of .pcm files. Also, after we introduced BitsPackers, it may slightly better to adjust the way we use Abbrev. After this patch, the size of the BMI for std module reduce from 28.94MB to 28.1 MB. This was reverted due to it broke the build of lldb. The reason that we skip the serialization of a source location incorrectly. And this patch now fixes that. 
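For illustration only, here is a minimal, self-contained sketch of the packing scheme this patch relies on. It is not the actual BitsPacker/BitsUnpacker code in ASTWriter.h/ASTReader.h, and the field names and widths below are made up for the example; the point is that the writer packs several narrow fields into one 32-bit record value and the reader must unpack them in the same order and with the same widths.

  // Writer-side packer: ORs narrow fields into a single 32-bit value.
  #include <cassert>
  #include <cstdint>
  #include <iostream>

  class PackerSketch {
    uint32_t Value = 0;
    uint32_t BitIndex = 0;

  public:
    bool canWriteNextNBits(uint32_t Width) const {
      return BitIndex + Width <= 32;
    }

    void addBits(uint32_t Bits, uint32_t Width) {
      assert(Bits < (1u << Width) && "value does not fit in the given width");
      assert(canWriteNextNBits(Width) && "packed value would overflow 32 bits");
      Value |= Bits << BitIndex;
      BitIndex += Width;
    }

    operator uint32_t() const { return Value; }
  };

  // Reader-side unpacker: reads fields back in the same order and widths.
  class UnpackerSketch {
    uint32_t Value;
    uint32_t BitIndex = 0;

  public:
    explicit UnpackerSketch(uint32_t V) : Value(V) {}

    uint32_t getNextBits(uint32_t Width) {
      uint32_t Bits = (Value >> BitIndex) & ((1u << Width) - 1);
      BitIndex += Width;
      return Bits;
    }

    bool getNextBit() { return getNextBits(1); }
  };

  int main() {
    // Writer side: pack a 5-bit kind, a 2-bit kind, and a 1-bit flag
    // (hypothetical fields, chosen only to show the mechanism).
    PackerSketch Packer;
    Packer.addBits(/*DependenceKind=*/19, /*Width=*/5);
    Packer.addBits(/*ValueKind=*/2, /*Width=*/2);
    Packer.addBits(/*HasFPFeatures=*/1, /*Width=*/1);
    uint32_t Record = Packer;

    // Reader side: unpack in exactly the same order, with the same widths.
    UnpackerSketch Unpacker(Record);
    std::cout << Unpacker.getNextBits(5) << ' ' << Unpacker.getNextBits(2)
              << ' ' << Unpacker.getNextBit() << '\n'; // prints: 19 2 1
    return 0;
  }

The size win comes from emitting one packed word instead of one record entry per boolean or small enum; the corresponding hazard, of the kind that motivated the earlier revert, is that any mismatch between what the writer packs (or skips) and what the reader unpacks silently shifts every later field.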
--- clang/include/clang/Serialization/ASTReader.h | 2 + clang/include/clang/Serialization/ASTWriter.h | 88 ++-- clang/lib/Serialization/ASTReaderDecl.cpp | 2 +- clang/lib/Serialization/ASTReaderStmt.cpp | 415 +++++++++++------- clang/lib/Serialization/ASTWriter.cpp | 15 +- clang/lib/Serialization/ASTWriterDecl.cpp | 381 ++++++++++++---- clang/lib/Serialization/ASTWriterStmt.cpp | 246 +++++++---- 7 files changed, 786 insertions(+), 363 deletions(-) diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 59358e77edb07..21d791f5cd89a 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -2422,6 +2422,8 @@ class BitsUnpacker { CurrentBitsIndex = 0; } + void advance(uint32_t BitsWidth) { CurrentBitsIndex += BitsWidth; } + bool getNextBit() { assert(isValid()); return Value & (1 << CurrentBitsIndex++); diff --git a/clang/include/clang/Serialization/ASTWriter.h b/clang/include/clang/Serialization/ASTWriter.h index a56929ef0245e..de69f99003d82 100644 --- a/clang/include/clang/Serialization/ASTWriter.h +++ b/clang/include/clang/Serialization/ASTWriter.h @@ -564,11 +564,25 @@ class ASTWriter : public ASTDeserializationListener, unsigned DeclEnumAbbrev = 0; unsigned DeclObjCIvarAbbrev = 0; unsigned DeclCXXMethodAbbrev = 0; + unsigned DeclDependentNonTemplateCXXMethodAbbrev = 0; + unsigned DeclTemplateCXXMethodAbbrev = 0; + unsigned DeclMemberSpecializedCXXMethodAbbrev = 0; + unsigned DeclTemplateSpecializedCXXMethodAbbrev = 0; + unsigned DeclDependentSpecializationCXXMethodAbbrev = 0; + unsigned DeclTemplateTypeParmAbbrev = 0; + unsigned DeclUsingShadowAbbrev = 0; unsigned DeclRefExprAbbrev = 0; unsigned CharacterLiteralAbbrev = 0; unsigned IntegerLiteralAbbrev = 0; unsigned ExprImplicitCastAbbrev = 0; + unsigned BinaryOperatorAbbrev = 0; + unsigned CompoundAssignOperatorAbbrev = 0; + unsigned CallExprAbbrev = 0; + unsigned CXXOperatorCallExprAbbrev = 0; + unsigned CXXMemberCallExprAbbrev = 0; + + unsigned CompoundStmtAbbrev = 0; void WriteDeclAbbrevs(); void WriteDecl(ASTContext &Context, Decl *D); @@ -735,12 +749,41 @@ class ASTWriter : public ASTDeserializationListener, unsigned getDeclFieldAbbrev() const { return DeclFieldAbbrev; } unsigned getDeclEnumAbbrev() const { return DeclEnumAbbrev; } unsigned getDeclObjCIvarAbbrev() const { return DeclObjCIvarAbbrev; } - unsigned getDeclCXXMethodAbbrev() const { return DeclCXXMethodAbbrev; } + unsigned getDeclCXXMethodAbbrev(FunctionDecl::TemplatedKind Kind) const { + switch (Kind) { + case FunctionDecl::TK_NonTemplate: + return DeclCXXMethodAbbrev; + case FunctionDecl::TK_FunctionTemplate: + return DeclTemplateCXXMethodAbbrev; + case FunctionDecl::TK_MemberSpecialization: + return DeclMemberSpecializedCXXMethodAbbrev; + case FunctionDecl::TK_FunctionTemplateSpecialization: + return DeclTemplateSpecializedCXXMethodAbbrev; + case FunctionDecl::TK_DependentNonTemplate: + return DeclDependentNonTemplateCXXMethodAbbrev; + case FunctionDecl::TK_DependentFunctionTemplateSpecialization: + return DeclDependentSpecializationCXXMethodAbbrev; + } + llvm_unreachable("Unknwon Template Kind!"); + } + unsigned getDeclTemplateTypeParmAbbrev() const { + return DeclTemplateTypeParmAbbrev; + } + unsigned getDeclUsingShadowAbbrev() const { return DeclUsingShadowAbbrev; } unsigned getDeclRefExprAbbrev() const { return DeclRefExprAbbrev; } unsigned getCharacterLiteralAbbrev() const { return CharacterLiteralAbbrev; } unsigned getIntegerLiteralAbbrev() const { 
return IntegerLiteralAbbrev; } unsigned getExprImplicitCastAbbrev() const { return ExprImplicitCastAbbrev; } + unsigned getBinaryOperatorAbbrev() const { return BinaryOperatorAbbrev; } + unsigned getCompoundAssignOperatorAbbrev() const { + return CompoundAssignOperatorAbbrev; + } + unsigned getCallExprAbbrev() const { return CallExprAbbrev; } + unsigned getCXXOperatorCallExprAbbrev() { return CXXOperatorCallExprAbbrev; } + unsigned getCXXMemberCallExprAbbrev() { return CXXMemberCallExprAbbrev; } + + unsigned getCompoundStmtAbbrev() const { return CompoundStmtAbbrev; } bool hasChain() const { return Chain; } ASTReader *getChain() const { return Chain; } @@ -841,46 +884,33 @@ class BitsPacker { BitsPacker(BitsPacker &&) = delete; BitsPacker operator=(const BitsPacker &) = delete; BitsPacker operator=(BitsPacker &&) = delete; - ~BitsPacker() { - assert(!hasUnconsumedValues() && "There are unprocessed bits!"); + ~BitsPacker() = default; + + bool canWriteNextNBits(uint32_t BitsWidth) const { + return CurrentBitIndex + BitsWidth < BitIndexUpbound; + } + + void reset(uint32_t Value) { + UnderlyingValue = Value; + CurrentBitIndex = 0; } void addBit(bool Value) { addBits(Value, 1); } void addBits(uint32_t Value, uint32_t BitsWidth) { assert(BitsWidth < BitIndexUpbound); assert((Value < (1u << BitsWidth)) && "Passing narrower bit width!"); + assert(canWriteNextNBits(BitsWidth) && + "Inserting too much bits into a value!"); - if (CurrentBitIndex + BitsWidth >= BitIndexUpbound) { - Values.push_back(0); - CurrentBitIndex = 0; - } - - assert(CurrentBitIndex < BitIndexUpbound); - Values.back() |= Value << CurrentBitIndex; + UnderlyingValue |= Value << CurrentBitIndex; CurrentBitIndex += BitsWidth; } - bool hasUnconsumedValues() const { - return ConsumingValueIndex < Values.size(); - } - uint32_t getNextValue() { - assert(hasUnconsumedValues()); - return Values[ConsumingValueIndex++]; - } - - // We can convert the packer to an uint32_t if there is only one values. - operator uint32_t() { - assert(Values.size() == 1); - return getNextValue(); - } + operator uint32_t() { return UnderlyingValue; } private: - SmallVector<uint64_t, 4> Values; - uint16_t ConsumingValueIndex = 0; - // Initialize CurrentBitIndex with an invalid value - // to make it easier to update Values. See the implementation - // of `addBits` to see the details. - uint16_t CurrentBitIndex = BitIndexUpbound; + uint32_t UnderlyingValue = 0; + uint32_t CurrentBitIndex = 0; }; } // namespace clang diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 7140a14aefbf9..209fb04342088 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2660,7 +2660,7 @@ void ASTDeclReader::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { D->setDeclaredWithTypename(Record.readInt()); - if (Record.readBool()) { + if (D->hasTypeConstraint()) { ConceptReference *CR = nullptr; if (Record.readBool()) CR = Record.readConceptReference(); diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index b3a6f619372b4..d2424bffc2288 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -73,6 +73,8 @@ namespace clang { ASTRecordReader &Record; llvm::BitstreamCursor &DeclsCursor; + std::optional<BitsUnpacker> CurrentUnpackingBits; + SourceLocation readSourceLocation() { return Record.readSourceLocation(); } @@ -110,6 +112,9 @@ namespace clang { /// itself. 
static const unsigned NumExprFields = NumStmtFields + 2; + /// The number of bits required for the packing bits for the Expr class. + static const unsigned NumExprBits = 10; + /// Read and initialize a ExplicitTemplateArgumentList structure. void ReadTemplateKWAndArgsInfo(ASTTemplateKWAndArgsInfo &Args, TemplateArgumentLoc *ArgsLocArray, @@ -147,9 +152,10 @@ void ASTStmtReader::VisitNullStmt(NullStmt *S) { void ASTStmtReader::VisitCompoundStmt(CompoundStmt *S) { VisitStmt(S); + CurrentUnpackingBits.emplace(Record.readInt()); SmallVector<Stmt *, 16> Stmts; - unsigned NumStmts = Record.readInt(); - unsigned HasFPFeatures = Record.readInt(); + unsigned NumStmts = CurrentUnpackingBits->getNextBits(/*Width=*/20); + unsigned HasFPFeatures = CurrentUnpackingBits->getNextBit(); assert(S->hasStoredFPFeatures() == HasFPFeatures); while (NumStmts--) Stmts.push_back(Record.readSubStmt()); @@ -214,9 +220,11 @@ void ASTStmtReader::VisitAttributedStmt(AttributedStmt *S) { void ASTStmtReader::VisitIfStmt(IfStmt *S) { VisitStmt(S); - bool HasElse = Record.readInt(); - bool HasVar = Record.readInt(); - bool HasInit = Record.readInt(); + CurrentUnpackingBits.emplace(Record.readInt()); + + bool HasElse = CurrentUnpackingBits->getNextBit(); + bool HasVar = CurrentUnpackingBits->getNextBit(); + bool HasInit = CurrentUnpackingBits->getNextBit(); S->setStatementKind(static_cast<IfStatementKind>(Record.readInt())); S->setCond(Record.readSubExpr()); @@ -523,14 +531,15 @@ void ASTStmtReader::VisitCapturedStmt(CapturedStmt *S) { void ASTStmtReader::VisitExpr(Expr *E) { VisitStmt(E); + CurrentUnpackingBits.emplace(Record.readInt()); + E->setDependence(static_cast<ExprDependence>( + CurrentUnpackingBits->getNextBits(/*Width=*/5))); + E->setValueKind(static_cast<ExprValueKind>( + CurrentUnpackingBits->getNextBits(/*Width=*/2))); + E->setObjectKind(static_cast<ExprObjectKind>( + CurrentUnpackingBits->getNextBits(/*Width=*/3))); + E->setType(Record.readType()); - BitsUnpacker ExprBits(Record.readInt()); - E->setDependence( - static_cast<ExprDependence>(ExprBits.getNextBits(/*Width=*/5))); - E->setValueKind( - static_cast<ExprValueKind>(ExprBits.getNextBits(/*Width=*/2))); - E->setObjectKind( - static_cast<ExprObjectKind>(ExprBits.getNextBits(/*Width=*/3))); assert(Record.getIdx() == NumExprFields && "Incorrect expression field count"); } @@ -591,17 +600,20 @@ void ASTStmtReader::VisitPredefinedExpr(PredefinedExpr *E) { void ASTStmtReader::VisitDeclRefExpr(DeclRefExpr *E) { VisitExpr(E); - E->DeclRefExprBits.HasQualifier = Record.readInt(); - E->DeclRefExprBits.HasFoundDecl = Record.readInt(); - E->DeclRefExprBits.HasTemplateKWAndArgsInfo = Record.readInt(); - E->DeclRefExprBits.HadMultipleCandidates = Record.readInt(); - E->DeclRefExprBits.RefersToEnclosingVariableOrCapture = Record.readInt(); - E->DeclRefExprBits.NonOdrUseReason = Record.readInt(); - E->DeclRefExprBits.IsImmediateEscalating = Record.readInt(); + E->DeclRefExprBits.HasQualifier = CurrentUnpackingBits->getNextBit(); + E->DeclRefExprBits.HasFoundDecl = CurrentUnpackingBits->getNextBit(); + E->DeclRefExprBits.HasTemplateKWAndArgsInfo = + CurrentUnpackingBits->getNextBit(); + E->DeclRefExprBits.HadMultipleCandidates = CurrentUnpackingBits->getNextBit(); + E->DeclRefExprBits.RefersToEnclosingVariableOrCapture = + CurrentUnpackingBits->getNextBit(); + E->DeclRefExprBits.NonOdrUseReason = + CurrentUnpackingBits->getNextBits(/*Width=*/2); + E->DeclRefExprBits.IsImmediateEscalating = CurrentUnpackingBits->getNextBit(); 
E->DeclRefExprBits.CapturedByCopyInLambdaWithExplicitObjectParameter = false; unsigned NumTemplateArgs = 0; if (E->hasTemplateKWAndArgsInfo()) - NumTemplateArgs = Record.readInt(); + NumTemplateArgs = CurrentUnpackingBits->getNextBits(/*Width=*/12); if (E->hasQualifier()) new (E->getTrailingObjects<NestedNameSpecifierLoc>()) @@ -706,12 +718,13 @@ void ASTStmtReader::VisitParenListExpr(ParenListExpr *E) { void ASTStmtReader::VisitUnaryOperator(UnaryOperator *E) { VisitExpr(E); - bool hasFP_Features = Record.readInt(); + bool hasFP_Features = CurrentUnpackingBits->getNextBit(); assert(hasFP_Features == E->hasStoredFPFeatures()); E->setSubExpr(Record.readSubExpr()); - E->setOpcode((UnaryOperator::Opcode)Record.readInt()); + E->setOpcode( + (UnaryOperator::Opcode)CurrentUnpackingBits->getNextBits(/*Width=*/5)); E->setOperatorLoc(readSourceLocation()); - E->setCanOverflow(Record.readInt()); + E->setCanOverflow(CurrentUnpackingBits->getNextBit()); if (hasFP_Features) E->setStoredFPFeatures( FPOptionsOverride::getFromOpaqueInt(Record.readInt())); @@ -1000,12 +1013,10 @@ void ASTStmtReader::VisitOMPIteratorExpr(OMPIteratorExpr *E) { void ASTStmtReader::VisitCallExpr(CallExpr *E) { VisitExpr(E); - BitsUnpacker CallExprBits = Record.readInt(); - - unsigned NumArgs = CallExprBits.getNextBits(/*Width=*/16); - bool HasFPFeatures = CallExprBits.getNextBit(); + unsigned NumArgs = CurrentUnpackingBits->getNextBits(/*Width=*/13); + bool HasFPFeatures = CurrentUnpackingBits->getNextBit(); E->setADLCallKind( - static_cast<CallExpr::ADLCallKind>(CallExprBits.getNextBit())); + static_cast<CallExpr::ADLCallKind>(CurrentUnpackingBits->getNextBit())); assert((NumArgs == E->getNumArgs()) && "Wrong NumArgs!"); E->setRParenLoc(readSourceLocation()); E->setCallee(Record.readSubExpr()); @@ -1024,27 +1035,28 @@ void ASTStmtReader::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) { void ASTStmtReader::VisitMemberExpr(MemberExpr *E) { VisitExpr(E); - bool HasQualifier = Record.readInt(); - bool HasFoundDecl = Record.readInt(); - bool HasTemplateInfo = Record.readInt(); - unsigned NumTemplateArgs = Record.readInt(); + bool HasQualifier = CurrentUnpackingBits->getNextBit(); + bool HasFoundDecl = CurrentUnpackingBits->getNextBit(); + bool HasTemplateInfo = CurrentUnpackingBits->getNextBit(); + unsigned NumTemplateArgs = CurrentUnpackingBits->getNextBits(/*Width=*/12); E->Base = Record.readSubExpr(); E->MemberDecl = Record.readDeclAs<ValueDecl>(); E->MemberDNLoc = Record.readDeclarationNameLoc(E->MemberDecl->getDeclName()); E->MemberLoc = Record.readSourceLocation(); - E->MemberExprBits.IsArrow = Record.readInt(); + E->MemberExprBits.IsArrow = CurrentUnpackingBits->getNextBit(); E->MemberExprBits.HasQualifierOrFoundDecl = HasQualifier || HasFoundDecl; E->MemberExprBits.HasTemplateKWAndArgsInfo = HasTemplateInfo; - E->MemberExprBits.HadMultipleCandidates = Record.readInt(); - E->MemberExprBits.NonOdrUseReason = Record.readInt(); + E->MemberExprBits.HadMultipleCandidates = CurrentUnpackingBits->getNextBit(); + E->MemberExprBits.NonOdrUseReason = + CurrentUnpackingBits->getNextBits(/*Width=*/2); E->MemberExprBits.OperatorLoc = Record.readSourceLocation(); if (HasQualifier || HasFoundDecl) { DeclAccessPair FoundDecl; if (HasFoundDecl) { auto *FoundD = Record.readDeclAs<NamedDecl>(); - auto AS = (AccessSpecifier)Record.readInt(); + auto AS = (AccessSpecifier)CurrentUnpackingBits->getNextBits(/*Width=*/2); FoundDecl = DeclAccessPair::make(FoundD, AS); } else { FoundDecl = DeclAccessPair::make(E->MemberDecl, @@ -1091,10 +1103,10 @@ 
void ASTStmtReader::VisitCastExpr(CastExpr *E) { VisitExpr(E); unsigned NumBaseSpecs = Record.readInt(); assert(NumBaseSpecs == E->path_size()); - unsigned HasFPFeatures = Record.readInt(); + unsigned HasFPFeatures = CurrentUnpackingBits->getNextBit(); assert(E->hasStoredFPFeatures() == HasFPFeatures); E->setSubExpr(Record.readSubExpr()); - E->setCastKind((CastKind)Record.readInt()); + E->setCastKind((CastKind)CurrentUnpackingBits->getNextBits(/*Width=*/7)); CastExpr::path_iterator BaseI = E->path_begin(); while (NumBaseSpecs--) { auto *BaseSpec = new (Record.getContext()) CXXBaseSpecifier; @@ -1107,10 +1119,12 @@ void ASTStmtReader::VisitCastExpr(CastExpr *E) { } void ASTStmtReader::VisitBinaryOperator(BinaryOperator *E) { - bool hasFP_Features; + VisitExpr(E); - E->setHasStoredFPFeatures(hasFP_Features = Record.readInt()); - E->setOpcode((BinaryOperator::Opcode)Record.readInt()); + bool hasFP_Features = CurrentUnpackingBits->getNextBit(); + E->setHasStoredFPFeatures(hasFP_Features); + E->setOpcode( + (BinaryOperator::Opcode)CurrentUnpackingBits->getNextBits(/*Width=*/6)); E->setLHS(Record.readSubExpr()); E->setRHS(Record.readSubExpr()); E->setOperatorLoc(readSourceLocation()); @@ -1148,7 +1162,7 @@ ASTStmtReader::VisitBinaryConditionalOperator(BinaryConditionalOperator *E) { void ASTStmtReader::VisitImplicitCastExpr(ImplicitCastExpr *E) { VisitCastExpr(E); - E->setIsPartOfExplicitCast(Record.readInt()); + E->setIsPartOfExplicitCast(CurrentUnpackingBits->getNextBit()); } void ASTStmtReader::VisitExplicitCastExpr(ExplicitCastExpr *E) { @@ -1686,7 +1700,8 @@ void ASTStmtReader::VisitMSDependentExistsStmt(MSDependentExistsStmt *S) { void ASTStmtReader::VisitCXXOperatorCallExpr(CXXOperatorCallExpr *E) { VisitCallExpr(E); - E->CXXOperatorCallExprBits.OperatorKind = Record.readInt(); + E->CXXOperatorCallExprBits.OperatorKind = + CurrentUnpackingBits->getNextBits(/*Width=*/6); E->Range = Record.readSourceRange(); } @@ -1764,8 +1779,8 @@ void ASTStmtReader::VisitCXXNamedCastExpr(CXXNamedCastExpr *E) { SourceRange R = readSourceRange(); E->Loc = R.getBegin(); E->RParenLoc = R.getEnd(); - R = readSourceRange(); - E->AngleBrackets = R; + if (CurrentUnpackingBits->getNextBit()) + E->AngleBrackets = readSourceRange(); } void ASTStmtReader::VisitCXXStaticCastExpr(CXXStaticCastExpr *E) { @@ -1961,9 +1976,9 @@ void ASTStmtReader::VisitCXXDependentScopeMemberExpr( CXXDependentScopeMemberExpr *E) { VisitExpr(E); - bool HasTemplateKWAndArgsInfo = Record.readInt(); - unsigned NumTemplateArgs = Record.readInt(); - bool HasFirstQualifierFoundInScope = Record.readInt(); + bool HasTemplateKWAndArgsInfo = CurrentUnpackingBits->getNextBit(); + unsigned NumTemplateArgs = CurrentUnpackingBits->getNextBits(/*Width=*/16); + bool HasFirstQualifierFoundInScope = CurrentUnpackingBits->getNextBit(); assert((HasTemplateKWAndArgsInfo == E->hasTemplateKWAndArgsInfo()) && "Wrong HasTemplateKWAndArgsInfo!"); @@ -1979,11 +1994,18 @@ void ASTStmtReader::VisitCXXDependentScopeMemberExpr( assert((NumTemplateArgs == E->getNumTemplateArgs()) && "Wrong NumTemplateArgs!"); - E->CXXDependentScopeMemberExprBits.IsArrow = Record.readInt(); - E->CXXDependentScopeMemberExprBits.OperatorLoc = readSourceLocation(); + E->CXXDependentScopeMemberExprBits.IsArrow = + CurrentUnpackingBits->getNextBit(); + E->BaseType = Record.readType(); E->QualifierLoc = Record.readNestedNameSpecifierLoc(); - E->Base = Record.readSubExpr(); + // not ImplicitAccess + if (CurrentUnpackingBits->getNextBit()) + E->Base = Record.readSubExpr(); + else + E->Base = 
nullptr; + + E->CXXDependentScopeMemberExprBits.OperatorLoc = readSourceLocation(); if (HasFirstQualifierFoundInScope) *E->getTrailingObjects<NamedDecl *>() = readDeclAs<NamedDecl>(); @@ -1995,11 +2017,11 @@ void ASTStmtReader::VisitDependentScopeDeclRefExpr(DependentScopeDeclRefExpr *E) { VisitExpr(E); - if (Record.readInt()) // HasTemplateKWAndArgsInfo + if (CurrentUnpackingBits->getNextBit()) // HasTemplateKWAndArgsInfo ReadTemplateKWAndArgsInfo( *E->getTrailingObjects<ASTTemplateKWAndArgsInfo>(), E->getTrailingObjects<TemplateArgumentLoc>(), - /*NumTemplateArgs=*/Record.readInt()); + /*NumTemplateArgs=*/CurrentUnpackingBits->getNextBits(/*Width=*/16)); E->QualifierLoc = Record.readNestedNameSpecifierLoc(); E->NameInfo = Record.readDeclarationNameInfo(); @@ -2022,15 +2044,15 @@ ASTStmtReader::VisitCXXUnresolvedConstructExpr(CXXUnresolvedConstructExpr *E) { void ASTStmtReader::VisitOverloadExpr(OverloadExpr *E) { VisitExpr(E); - BitsUnpacker OverloadExprBits = Record.readInt(); - unsigned NumResults = OverloadExprBits.getNextBits(/*Width=*/14); - bool HasTemplateKWAndArgsInfo = OverloadExprBits.getNextBit(); + CurrentUnpackingBits.emplace(Record.readInt()); + unsigned NumResults = CurrentUnpackingBits->getNextBits(/*Width=*/12); + bool HasTemplateKWAndArgsInfo = CurrentUnpackingBits->getNextBit(); assert((E->getNumDecls() == NumResults) && "Wrong NumResults!"); assert((E->hasTemplateKWAndArgsInfo() == HasTemplateKWAndArgsInfo) && "Wrong HasTemplateKWAndArgsInfo!"); if (HasTemplateKWAndArgsInfo) { - unsigned NumTemplateArgs = OverloadExprBits.getNextBits(/*Width=*/14); + unsigned NumTemplateArgs = CurrentUnpackingBits->getNextBits(/*Width=*/12); ReadTemplateKWAndArgsInfo(*E->getTrailingASTTemplateKWAndArgsInfo(), E->getTrailingTemplateArgumentLoc(), NumTemplateArgs); @@ -2057,17 +2079,24 @@ void ASTStmtReader::VisitOverloadExpr(OverloadExpr *E) { void ASTStmtReader::VisitUnresolvedMemberExpr(UnresolvedMemberExpr *E) { VisitOverloadExpr(E); - E->UnresolvedMemberExprBits.IsArrow = Record.readInt(); - E->UnresolvedMemberExprBits.HasUnresolvedUsing = Record.readInt(); - E->Base = Record.readSubExpr(); - E->BaseType = Record.readType(); + E->UnresolvedMemberExprBits.IsArrow = CurrentUnpackingBits->getNextBit(); + E->UnresolvedMemberExprBits.HasUnresolvedUsing = + CurrentUnpackingBits->getNextBit(); + + if (/*!isImplicitAccess=*/CurrentUnpackingBits->getNextBit()) + E->Base = Record.readSubExpr(); + else + E->Base = nullptr; + E->OperatorLoc = readSourceLocation(); + + E->BaseType = Record.readType(); } void ASTStmtReader::VisitUnresolvedLookupExpr(UnresolvedLookupExpr *E) { VisitOverloadExpr(E); - E->UnresolvedLookupExprBits.RequiresADL = Record.readInt(); - E->UnresolvedLookupExprBits.Overloaded = Record.readInt(); + E->UnresolvedLookupExprBits.RequiresADL = CurrentUnpackingBits->getNextBit(); + E->UnresolvedLookupExprBits.Overloaded = CurrentUnpackingBits->getNextBit(); E->NamingClass = readDeclAs<CXXRecordDecl>(); } @@ -2142,9 +2171,12 @@ void ASTStmtReader::VisitSubstNonTypeTemplateParmExpr( SubstNonTypeTemplateParmExpr *E) { VisitExpr(E); E->AssociatedDeclAndRef.setPointer(readDeclAs<Decl>()); - E->AssociatedDeclAndRef.setInt(Record.readInt()); - E->Index = Record.readInt(); - E->PackIndex = Record.readInt(); + E->AssociatedDeclAndRef.setInt(CurrentUnpackingBits->getNextBit()); + E->Index = CurrentUnpackingBits->getNextBits(/*Width=*/12); + if (CurrentUnpackingBits->getNextBit()) + E->PackIndex = Record.readInt(); + else + E->PackIndex = 0; E->SubstNonTypeTemplateParmExprBits.NameLoc = 
readSourceLocation(); E->Replacement = Record.readSubExpr(); } @@ -2836,11 +2868,13 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { S = new (Context) NullStmt(Empty); break; - case STMT_COMPOUND: - S = CompoundStmt::CreateEmpty( - Context, /*NumStmts=*/Record[ASTStmtReader::NumStmtFields], - /*HasFPFeatures=*/Record[ASTStmtReader::NumStmtFields + 1]); + case STMT_COMPOUND: { + BitsUnpacker StmtCompoundBits(Record[ASTStmtReader::NumStmtFields]); + unsigned NumStmts = StmtCompoundBits.getNextBits(/*Width=*/20); + bool HasFPFeatures = StmtCompoundBits.getNextBit(); + S = CompoundStmt::CreateEmpty(Context, NumStmts, HasFPFeatures); break; + } case STMT_CASE: S = CaseStmt::CreateEmpty( @@ -2862,13 +2896,14 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { /*NumAttrs*/Record[ASTStmtReader::NumStmtFields]); break; - case STMT_IF: - S = IfStmt::CreateEmpty( - Context, - /* HasElse=*/Record[ASTStmtReader::NumStmtFields], - /* HasVar=*/Record[ASTStmtReader::NumStmtFields + 1], - /* HasInit=*/Record[ASTStmtReader::NumStmtFields + 2]); + case STMT_IF: { + BitsUnpacker IfStmtBits(Record[ASTStmtReader::NumStmtFields]); + bool HasElse = IfStmtBits.getNextBit(); + bool HasVar = IfStmtBits.getNextBit(); + bool HasInit = IfStmtBits.getNextBit(); + S = IfStmt::CreateEmpty(Context, HasElse, HasVar, HasInit); break; + } case STMT_SWITCH: S = SwitchStmt::CreateEmpty( @@ -2945,17 +2980,20 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { /*HasFunctionName*/ Record[ASTStmtReader::NumExprFields]); break; - case EXPR_DECL_REF: - S = DeclRefExpr::CreateEmpty( - Context, - /*HasQualifier=*/Record[ASTStmtReader::NumExprFields], - /*HasFoundDecl=*/Record[ASTStmtReader::NumExprFields + 1], - /*HasTemplateKWAndArgsInfo=*/Record[ASTStmtReader::NumExprFields + 2], - /*NumTemplateArgs=*/ - Record[ASTStmtReader::NumExprFields + 2] - ? Record[ASTStmtReader::NumExprFields + 7] - : 0); + case EXPR_DECL_REF: { + BitsUnpacker DeclRefExprBits(Record[ASTStmtReader::NumStmtFields]); + DeclRefExprBits.advance(ASTStmtReader::NumExprBits); + bool HasQualifier = DeclRefExprBits.getNextBit(); + bool HasFoundDecl = DeclRefExprBits.getNextBit(); + bool HasTemplateKWAndArgsInfo = DeclRefExprBits.getNextBit(); + DeclRefExprBits.advance(5); + unsigned NumTemplateArgs = HasTemplateKWAndArgsInfo + ? 
DeclRefExprBits.getNextBits(/*Width=*/12) + : 0; + S = DeclRefExpr::CreateEmpty(Context, HasQualifier, HasFoundDecl, + HasTemplateKWAndArgsInfo, NumTemplateArgs); break; + } case EXPR_INTEGER_LITERAL: S = IntegerLiteral::Create(Context, Empty); @@ -2995,10 +3033,13 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { /* NumExprs=*/Record[ASTStmtReader::NumExprFields]); break; - case EXPR_UNARY_OPERATOR: - S = UnaryOperator::CreateEmpty(Context, - Record[ASTStmtReader::NumExprFields]); + case EXPR_UNARY_OPERATOR: { + BitsUnpacker UnaryOperatorBits(Record[ASTStmtReader::NumStmtFields]); + UnaryOperatorBits.advance(ASTStmtReader::NumExprBits); + bool HasFPFeatures = UnaryOperatorBits.getNextBit(); + S = UnaryOperator::CreateEmpty(Context, HasFPFeatures); break; + } case EXPR_OFFSETOF: S = OffsetOfExpr::CreateEmpty(Context, @@ -3033,8 +3074,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_CALL: { - BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields]); - auto NumArgs = CallExprBits.getNextBits(/*Width=*/16); + BitsUnpacker CallExprBits(Record[ASTStmtReader::NumStmtFields]); + CallExprBits.advance(ASTStmtReader::NumExprBits); + auto NumArgs = CallExprBits.getNextBits(/*Width=*/13); auto HasFPFeatures = CallExprBits.getNextBit(); S = CallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures, Empty); break; @@ -3045,22 +3087,33 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { Context, /*NumArgs=*/Record[ASTStmtReader::NumExprFields]); break; - case EXPR_MEMBER: - S = MemberExpr::CreateEmpty(Context, Record[ASTStmtReader::NumExprFields], - Record[ASTStmtReader::NumExprFields + 1], - Record[ASTStmtReader::NumExprFields + 2], - Record[ASTStmtReader::NumExprFields + 3]); + case EXPR_MEMBER: { + BitsUnpacker ExprMemberBits(Record[ASTStmtReader::NumStmtFields]); + ExprMemberBits.advance(ASTStmtReader::NumExprBits); + bool HasQualifier = ExprMemberBits.getNextBit(); + bool HasFoundDecl = ExprMemberBits.getNextBit(); + bool HasTemplateInfo = ExprMemberBits.getNextBit(); + unsigned NumTemplateArgs = ExprMemberBits.getNextBits(/*Width=*/12); + S = MemberExpr::CreateEmpty(Context, HasQualifier, HasFoundDecl, + HasTemplateInfo, NumTemplateArgs); break; + } - case EXPR_BINARY_OPERATOR: - S = BinaryOperator::CreateEmpty(Context, - Record[ASTStmtReader::NumExprFields]); + case EXPR_BINARY_OPERATOR: { + BitsUnpacker BinaryOperatorBits(Record[ASTStmtReader::NumStmtFields]); + BinaryOperatorBits.advance(ASTStmtReader::NumExprBits); + bool HasFPFeatures = BinaryOperatorBits.getNextBit(); + S = BinaryOperator::CreateEmpty(Context, HasFPFeatures); break; + } - case EXPR_COMPOUND_ASSIGN_OPERATOR: - S = CompoundAssignOperator::CreateEmpty( - Context, Record[ASTStmtReader::NumExprFields]); + case EXPR_COMPOUND_ASSIGN_OPERATOR: { + BitsUnpacker BinaryOperatorBits(Record[ASTStmtReader::NumStmtFields]); + BinaryOperatorBits.advance(ASTStmtReader::NumExprBits); + bool HasFPFeatures = BinaryOperatorBits.getNextBit(); + S = CompoundAssignOperator::CreateEmpty(Context, HasFPFeatures); break; + } case EXPR_CONDITIONAL_OPERATOR: S = new (Context) ConditionalOperator(Empty); @@ -3070,19 +3123,23 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { S = new (Context) BinaryConditionalOperator(Empty); break; - case EXPR_IMPLICIT_CAST: - S = ImplicitCastExpr::CreateEmpty( - Context, - /*PathSize*/ Record[ASTStmtReader::NumExprFields], - /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]); + case EXPR_IMPLICIT_CAST: { + BitsUnpacker 
CastExprBits(Record[ASTStmtReader::NumStmtFields]); + CastExprBits.advance(ASTStmtReader::NumExprBits); + unsigned PathSize = Record[ASTStmtReader::NumExprFields]; + bool HasFPFeatures = CastExprBits.getNextBit(); + S = ImplicitCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures); break; + } - case EXPR_CSTYLE_CAST: - S = CStyleCastExpr::CreateEmpty( - Context, - /*PathSize*/ Record[ASTStmtReader::NumExprFields], - /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]); + case EXPR_CSTYLE_CAST: { + BitsUnpacker CastExprBits(Record[ASTStmtReader::NumStmtFields]); + CastExprBits.advance(ASTStmtReader::NumExprBits); + unsigned PathSize = Record[ASTStmtReader::NumExprFields]; + bool HasFPFeatures = CastExprBits.getNextBit(); + S = CStyleCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures); break; + } case EXPR_COMPOUND_LITERAL: S = new (Context) CompoundLiteralExpr(Empty); @@ -3777,8 +3834,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { } case EXPR_CXX_OPERATOR_CALL: { - BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields]); - auto NumArgs = CallExprBits.getNextBits(/*Width=*/16); + BitsUnpacker CallExprBits(Record[ASTStmtReader::NumStmtFields]); + CallExprBits.advance(ASTStmtReader::NumExprBits); + auto NumArgs = CallExprBits.getNextBits(/*Width=*/13); auto HasFPFeatures = CallExprBits.getNextBit(); S = CXXOperatorCallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures, Empty); @@ -3786,8 +3844,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { } case EXPR_CXX_MEMBER_CALL: { - BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields]); - auto NumArgs = CallExprBits.getNextBits(/*Width=*/16); + BitsUnpacker CallExprBits(Record[ASTStmtReader::NumStmtFields]); + CallExprBits.advance(ASTStmtReader::NumExprBits); + auto NumArgs = CallExprBits.getNextBits(/*Width=*/13); auto HasFPFeatures = CallExprBits.getNextBit(); S = CXXMemberCallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures, Empty); @@ -3814,22 +3873,26 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { /* NumArgs=*/Record[ASTStmtReader::NumExprFields]); break; - case EXPR_CXX_STATIC_CAST: - S = CXXStaticCastExpr::CreateEmpty( - Context, - /*PathSize*/ Record[ASTStmtReader::NumExprFields], - /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]); + case EXPR_CXX_STATIC_CAST: { + BitsUnpacker CastExprBits(Record[ASTStmtReader::NumStmtFields]); + CastExprBits.advance(ASTStmtReader::NumExprBits); + unsigned PathSize = Record[ASTStmtReader::NumExprFields]; + bool HasFPFeatures = CastExprBits.getNextBit(); + S = CXXStaticCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures); break; + } - case EXPR_CXX_DYNAMIC_CAST: - S = CXXDynamicCastExpr::CreateEmpty(Context, - /*PathSize*/ Record[ASTStmtReader::NumExprFields]); + case EXPR_CXX_DYNAMIC_CAST: { + unsigned PathSize = Record[ASTStmtReader::NumExprFields]; + S = CXXDynamicCastExpr::CreateEmpty(Context, PathSize); break; + } - case EXPR_CXX_REINTERPRET_CAST: - S = CXXReinterpretCastExpr::CreateEmpty(Context, - /*PathSize*/ Record[ASTStmtReader::NumExprFields]); + case EXPR_CXX_REINTERPRET_CAST: { + unsigned PathSize = Record[ASTStmtReader::NumExprFields]; + S = CXXReinterpretCastExpr::CreateEmpty(Context, PathSize); break; + } case EXPR_CXX_CONST_CAST: S = CXXConstCastExpr::CreateEmpty(Context); @@ -3839,21 +3902,28 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { S = CXXAddrspaceCastExpr::CreateEmpty(Context); break; - case EXPR_CXX_FUNCTIONAL_CAST: - S = CXXFunctionalCastExpr::CreateEmpty( - Context, - /*PathSize*/ 
Record[ASTStmtReader::NumExprFields], - /*HasFPFeatures*/ Record[ASTStmtReader::NumExprFields + 1]); + case EXPR_CXX_FUNCTIONAL_CAST: { + BitsUnpacker CastExprBits(Record[ASTStmtReader::NumStmtFields]); + CastExprBits.advance(ASTStmtReader::NumExprBits); + unsigned PathSize = Record[ASTStmtReader::NumExprFields]; + bool HasFPFeatures = CastExprBits.getNextBit(); + S = CXXFunctionalCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures); break; + } - case EXPR_BUILTIN_BIT_CAST: - assert(Record[ASTStmtReader::NumExprFields] == 0 && "Wrong PathSize!"); + case EXPR_BUILTIN_BIT_CAST: { +#ifndef NDEBUG + unsigned PathSize = Record[ASTStmtReader::NumExprFields]; + assert(PathSize == 0 && "Wrong PathSize!"); +#endif S = new (Context) BuiltinBitCastExpr(Empty); break; + } case EXPR_USER_DEFINED_LITERAL: { - BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields]); - auto NumArgs = CallExprBits.getNextBits(/*Width=*/16); + BitsUnpacker CallExprBits(Record[ASTStmtReader::NumStmtFields]); + CallExprBits.advance(ASTStmtReader::NumExprBits); + auto NumArgs = CallExprBits.getNextBits(/*Width=*/13); auto HasFPFeatures = CallExprBits.getNextBit(); S = UserDefinedLiteral::CreateEmpty(Context, NumArgs, HasFPFeatures, Empty); @@ -3944,47 +4014,63 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { Record[ASTStmtReader::NumExprFields]); break; - case EXPR_CXX_DEPENDENT_SCOPE_MEMBER: + case EXPR_CXX_DEPENDENT_SCOPE_MEMBER: { + BitsUnpacker DependentScopeMemberBits( + Record[ASTStmtReader::NumStmtFields]); + DependentScopeMemberBits.advance(ASTStmtReader::NumExprBits); + bool HasTemplateKWAndArgsInfo = DependentScopeMemberBits.getNextBit(); + unsigned NumTemplateArgs = + DependentScopeMemberBits.getNextBits(/*Width=*/16); + bool HasFirstQualifierFoundInScope = + DependentScopeMemberBits.getNextBit(); S = CXXDependentScopeMemberExpr::CreateEmpty( - Context, - /*HasTemplateKWAndArgsInfo=*/Record[ASTStmtReader::NumExprFields], - /*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields + 1], - /*HasFirstQualifierFoundInScope=*/ - Record[ASTStmtReader::NumExprFields + 2]); + Context, HasTemplateKWAndArgsInfo, NumTemplateArgs, + HasFirstQualifierFoundInScope); break; + } - case EXPR_CXX_DEPENDENT_SCOPE_DECL_REF: - S = DependentScopeDeclRefExpr::CreateEmpty(Context, - /*HasTemplateKWAndArgsInfo=*/Record[ASTStmtReader::NumExprFields], - /*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields] - ? Record[ASTStmtReader::NumExprFields + 1] - : 0); + case EXPR_CXX_DEPENDENT_SCOPE_DECL_REF: { + BitsUnpacker DependentScopeDeclRefBits( + Record[ASTStmtReader::NumStmtFields]); + DependentScopeDeclRefBits.advance(ASTStmtReader::NumExprBits); + bool HasTemplateKWAndArgsInfo = DependentScopeDeclRefBits.getNextBit(); + unsigned NumTemplateArgs = + HasTemplateKWAndArgsInfo + ? DependentScopeDeclRefBits.getNextBits(/*Width=*/16) + : 0; + S = DependentScopeDeclRefExpr::CreateEmpty( + Context, HasTemplateKWAndArgsInfo, NumTemplateArgs); break; + } case EXPR_CXX_UNRESOLVED_CONSTRUCT: S = CXXUnresolvedConstructExpr::CreateEmpty(Context, /*NumArgs=*/Record[ASTStmtReader::NumExprFields]); break; - case EXPR_CXX_UNRESOLVED_MEMBER: + case EXPR_CXX_UNRESOLVED_MEMBER: { + BitsUnpacker OverloadExprBits(Record[ASTStmtReader::NumExprFields]); + auto NumResults = OverloadExprBits.getNextBits(/*Width=*/12); + auto HasTemplateKWAndArgsInfo = OverloadExprBits.getNextBit(); + auto NumTemplateArgs = HasTemplateKWAndArgsInfo + ? 
OverloadExprBits.getNextBits(/*Width=*/12) + : 0; S = UnresolvedMemberExpr::CreateEmpty( - Context, - /*NumResults=*/Record[ASTStmtReader::NumExprFields] & ((1 << 14) - 1), - /*HasTemplateKWAndArgsInfo=*/ - (Record[ASTStmtReader::NumExprFields] >> 14) & (0x1), - /*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields] >> 14 & - ((1 << 14) - 1)); + Context, NumResults, HasTemplateKWAndArgsInfo, NumTemplateArgs); break; + } - case EXPR_CXX_UNRESOLVED_LOOKUP: + case EXPR_CXX_UNRESOLVED_LOOKUP: { + BitsUnpacker OverloadExprBits(Record[ASTStmtReader::NumExprFields]); + auto NumResults = OverloadExprBits.getNextBits(/*Width=*/12); + auto HasTemplateKWAndArgsInfo = OverloadExprBits.getNextBit(); + auto NumTemplateArgs = HasTemplateKWAndArgsInfo + ? OverloadExprBits.getNextBits(/*Width=*/12) + : 0; S = UnresolvedLookupExpr::CreateEmpty( - Context, - /*NumResults=*/Record[ASTStmtReader::NumExprFields] & ((1 << 14) - 1), - /*HasTemplateKWAndArgsInfo=*/ - (Record[ASTStmtReader::NumExprFields] >> 14) & (0x1), - /*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields] >> 14 & - ((1 << 14) - 1)); + Context, NumResults, HasTemplateKWAndArgsInfo, NumTemplateArgs); break; + } case EXPR_TYPE_TRAIT: S = TypeTraitExpr::CreateDeserialized(Context, @@ -4044,8 +4130,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_CUDA_KERNEL_CALL: { - BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields]); - auto NumArgs = CallExprBits.getNextBits(/*Width=*/16); + BitsUnpacker CallExprBits(Record[ASTStmtReader::NumStmtFields]); + CallExprBits.advance(ASTStmtReader::NumExprBits); + auto NumArgs = CallExprBits.getNextBits(/*Width=*/13); auto HasFPFeatures = CallExprBits.getNextBit(); S = CUDAKernelCallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures, Empty); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 91eb2af8f8ad6..78939bfd533ff 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6003,12 +6003,17 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { BitsPacker DefinitionBits; -#define FIELD(Name, Width, Merge) DefinitionBits.addBits(Data.Name, Width); +#define FIELD(Name, Width, Merge) \ + if (!DefinitionBits.canWriteNextNBits(Width)) { \ + Record->push_back(DefinitionBits); \ + DefinitionBits.reset(0); \ + } \ + DefinitionBits.addBits(Data.Name, Width); + #include "clang/AST/CXXRecordDeclDefinitionBits.def" #undef FIELD - while (DefinitionBits.hasUnconsumedValues()) - Record->push_back(DefinitionBits.getNextValue()); + Record->push_back(DefinitionBits); // getODRHash will compute the ODRHash if it has not been previously computed. 
Record->push_back(D->getODRHash()); @@ -6047,7 +6052,7 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { LambdaBits.addBits(Lambda.CaptureDefault, /*Width=*/2); LambdaBits.addBits(Lambda.NumCaptures, /*Width=*/15); LambdaBits.addBit(Lambda.HasKnownInternalLinkage); - Record->push_back(LambdaBits.getNextValue()); + Record->push_back(LambdaBits); Record->push_back(Lambda.NumExplicitCaptures); Record->push_back(Lambda.ManglingNumber); @@ -6058,10 +6063,12 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { for (unsigned I = 0, N = Lambda.NumCaptures; I != N; ++I) { const LambdaCapture &Capture = Lambda.Captures.front()[I]; AddSourceLocation(Capture.getLocation()); + BitsPacker CaptureBits; CaptureBits.addBit(Capture.isImplicit()); CaptureBits.addBits(Capture.getCaptureKind(), /*Width=*/3); Record->push_back(CaptureBits); + switch (Capture.getCaptureKind()) { case LCK_StarThis: case LCK_This: diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 43169b2befc68..53128133588fa 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -496,15 +496,10 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() && !D->isImplicit() && - !D->isUsed(false) && !D->hasExtInfo() && !D->getTypedefNameForAnonDecl() && D->getFirstDecl() == D->getMostRecentDecl() && - !D->isInvalidDecl() && - !D->isReferenced() && !D->isTopLevelDeclInObjCContainer() && - D->getAccess() == AS_none && - !D->isModulePrivate() && !CXXRecordDecl::classofKind(D->getKind()) && !D->getIntegerTypeSourceInfo() && !D->getMemberSpecializationInfo() && @@ -544,16 +539,10 @@ void ASTDeclWriter::VisitRecordDecl(RecordDecl *D) { if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() && - !D->isImplicit() && - !D->isUsed(false) && !D->hasExtInfo() && !D->getTypedefNameForAnonDecl() && D->getFirstDecl() == D->getMostRecentDecl() && - !D->isInvalidDecl() && - !D->isReferenced() && !D->isTopLevelDeclInObjCContainer() && - D->getAccess() == AS_none && - !D->isModulePrivate() && !CXXRecordDecl::classofKind(D->getKind()) && !needsAnonymousDeclarationNumber(D) && D->getDeclName().getNameKind() == DeclarationName::Identifier) @@ -1137,13 +1126,7 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) { if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() && - !D->isImplicit() && - !D->isUsed(false) && - !D->isInvalidDecl() && - !D->isReferenced() && !D->isTopLevelDeclInObjCContainer() && - D->getAccess() == AS_none && - !D->isModulePrivate() && !needsAnonymousDeclarationNumber(D) && D->getDeclName().getNameKind() == DeclarationName::Identifier && !D->hasExtInfo() && @@ -1193,14 +1176,9 @@ void ASTDeclWriter::VisitParmVarDecl(ParmVarDecl *D) { // we dynamically check for the properties that we optimize for, but don't // know are true of all PARM_VAR_DECLs. if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() && - !D->hasExtInfo() && !D->isImplicit() && !D->isUsed(false) && - !D->isInvalidDecl() && !D->isReferenced() && D->getAccess() == AS_none && - !D->isModulePrivate() && D->getStorageClass() == 0 && + !D->hasExtInfo() && D->getStorageClass() == 0 && D->getInitStyle() == VarDecl::CInit && // Can params have anything else? 
- D->getFunctionScopeDepth() == 0 && D->getObjCDeclQualifier() == 0 && - !D->isKNRPromoted() && !D->isExplicitObjectParameter() && - !D->hasInheritedDefaultArg() && D->getInit() == nullptr && - !D->hasUninstantiatedDefaultArg()) // No default expr. + D->getInit() == nullptr) // No default expr. AbbrevToUse = Writer.getDeclParmVarAbbrev(); // Check things we know are true of *every* PARM_VAR_DECL, which is more than @@ -1403,6 +1381,13 @@ void ASTDeclWriter::VisitUsingShadowDecl(UsingShadowDecl *D) { Record.push_back(D->getIdentifierNamespace()); Record.AddDeclRef(D->UsingOrNextShadow); Record.AddDeclRef(Context.getInstantiatedFromUsingShadowDecl(D)); + + if (D->getDeclContext() == D->getLexicalDeclContext() && + D->getFirstDecl() == D->getMostRecentDecl() && !D->hasAttrs() && + !needsAnonymousDeclarationNumber(D) && + D->getDeclName().getNameKind() == DeclarationName::Identifier) + AbbrevToUse = Writer.getDeclUsingShadowAbbrev(); + Code = serialization::DECL_USING_SHADOW; } @@ -1507,10 +1492,32 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) { D->getFirstDecl() == D->getMostRecentDecl() && !D->isInvalidDecl() && !D->hasAttrs() && !D->isTopLevelDeclInObjCContainer() && D->getDeclName().getNameKind() == DeclarationName::Identifier && - !D->hasExtInfo() && !D->hasInheritedPrototype() && - D->hasWrittenPrototype() && - D->getTemplatedKind() == FunctionDecl::TK_NonTemplate) - AbbrevToUse = Writer.getDeclCXXMethodAbbrev(); + !D->hasExtInfo() && !D->isExplicitlyDefaulted()) { + if (D->getTemplatedKind() == FunctionDecl::TK_NonTemplate || + D->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate || + D->getTemplatedKind() == FunctionDecl::TK_MemberSpecialization || + D->getTemplatedKind() == FunctionDecl::TK_DependentNonTemplate) + AbbrevToUse = Writer.getDeclCXXMethodAbbrev(D->getTemplatedKind()); + else if (D->getTemplatedKind() == + FunctionDecl::TK_FunctionTemplateSpecialization) { + FunctionTemplateSpecializationInfo *FTSInfo = + D->getTemplateSpecializationInfo(); + + if (FTSInfo->TemplateArguments->size() == 1) { + const TemplateArgument &TA = FTSInfo->TemplateArguments->get(0); + if (TA.getKind() == TemplateArgument::Type && + !FTSInfo->TemplateArgumentsAsWritten && + !FTSInfo->getMemberSpecializationInfo()) + AbbrevToUse = Writer.getDeclCXXMethodAbbrev(D->getTemplatedKind()); + } + } else if (D->getTemplatedKind() == + FunctionDecl::TK_DependentFunctionTemplateSpecialization) { + DependentFunctionTemplateSpecializationInfo *DFTSInfo = + D->getDependentSpecializationInfo(); + if (!DFTSInfo->TemplateArgumentsAsWritten) + AbbrevToUse = Writer.getDeclCXXMethodAbbrev(D->getTemplatedKind()); + } + } Code = serialization::DECL_CXX_METHOD; } @@ -1782,7 +1789,7 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { Record.push_back(D->wasDeclaredWithTypename()); const TypeConstraint *TC = D->getTypeConstraint(); - Record.push_back(TC != nullptr); + assert((bool)TC == D->hasTypeConstraint()); if (TC) { auto *CR = TC->getConceptReference(); Record.push_back(CR != nullptr); @@ -1800,6 +1807,13 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { if (OwnsDefaultArg) Record.AddTypeSourceInfo(D->getDefaultArgumentInfo()); + if (!TC && !OwnsDefaultArg && + D->getDeclContext() == D->getLexicalDeclContext() && + !D->isInvalidDecl() && !D->hasAttrs() && + !D->isTopLevelDeclInObjCContainer() && + D->getDeclName().getNameKind() == DeclarationName::Identifier) + AbbrevToUse = Writer.getDeclTemplateTypeParmAbbrev(); + Code = 
serialization::DECL_TEMPLATE_TYPE_PARM; } @@ -2031,6 +2045,104 @@ void ASTDeclWriter::VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D) { // ASTWriter Implementation //===----------------------------------------------------------------------===// +namespace { +template <FunctionDecl::TemplatedKind Kind> +std::shared_ptr<llvm::BitCodeAbbrev> +getFunctionDeclAbbrev(serialization::DeclCode Code) { + using namespace llvm; + + auto Abv = std::make_shared<BitCodeAbbrev>(); + Abv->Add(BitCodeAbbrevOp(Code)); + // RedeclarableDecl + Abv->Add(BitCodeAbbrevOp(0)); // CanonicalDecl + Abv->Add(BitCodeAbbrevOp(Kind)); + if constexpr (Kind == FunctionDecl::TK_NonTemplate) { + + } else if constexpr (Kind == FunctionDecl::TK_FunctionTemplate) { + // DescribedFunctionTemplate + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); + } else if constexpr (Kind == FunctionDecl::TK_DependentNonTemplate) { + // Instantiated From Decl + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); + } else if constexpr (Kind == FunctionDecl::TK_MemberSpecialization) { + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // InstantiatedFrom + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, + 3)); // TemplateSpecializationKind + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Specialized Location + } else if constexpr (Kind == + FunctionDecl::TK_FunctionTemplateSpecialization) { + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Template + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, + 3)); // TemplateSpecializationKind + Abv->Add(BitCodeAbbrevOp(1)); // Template Argument Size + Abv->Add(BitCodeAbbrevOp(TemplateArgument::Type)); // Template Argument Kind + Abv->Add( + BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Template Argument Type + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // Is Defaulted + Abv->Add(BitCodeAbbrevOp(0)); // TemplateArgumentsAsWritten + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SourceLocation + Abv->Add(BitCodeAbbrevOp(0)); + Abv->Add( + BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Canonical Decl of template + } else if constexpr (Kind == FunctionDecl:: + TK_DependentFunctionTemplateSpecialization) { + // Candidates of specialization + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abv->Add(BitCodeAbbrevOp(0)); // TemplateArgumentsAsWritten + } else { + llvm_unreachable("Unknown templated kind?"); + } + // Decl + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, + 12)); // Packed DeclBits: HasStandaloneLexicalDC, + // isInvalidDecl, HasAttrs, isImplicit, isUsed, + // isReferenced, TopLevelDeclInObjCContainer, + // AccessSpecifier, ModuleOwnershipKind + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID + // NamedDecl + Abv->Add(BitCodeAbbrevOp(DeclarationName::Identifier)); // NameKind + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Identifier + Abv->Add(BitCodeAbbrevOp(0)); // AnonDeclNumber + // ValueDecl + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type + // DeclaratorDecl + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // InnerLocStart + Abv->Add(BitCodeAbbrevOp(0)); // HasExtInfo + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TSIType + // FunctionDecl + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 11)); // IDNS + Abv->Add(BitCodeAbbrevOp( + BitCodeAbbrevOp::Fixed, + 27)); // Packed Function Bits: StorageClass, Inline, InlineSpecified, + // VirtualAsWritten, Pure, HasInheritedProto, HasWrittenProto, + // Deleted, Trivial, 
TrivialForCall, Defaulted, ExplicitlyDefaulted, + // IsIneligibleOrNotSelected, ImplicitReturnZero, Constexpr, + // UsesSEHTry, SkippedBody, MultiVersion, LateParsed, + // FriendConstraintRefersToEnclosingTemplate, Linkage + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LocEnd + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // ODRHash + // This Array slurps the rest of the record. Fortunately we want to encode + // (nearly) all the remaining (variable number of) fields in the same way. + // + // This is: + // NumParams and Params[] from FunctionDecl, and + // NumOverriddenMethods, OverriddenMethods[] from CXXMethodDecl. + // + // Add an AbbrevOp for 'size then elements' and use it here. + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + return Abv; +} + +template <FunctionDecl::TemplatedKind Kind> +std::shared_ptr<llvm::BitCodeAbbrev> getCXXMethodAbbrev() { + using namespace llvm; + auto Abv = getFunctionDeclAbbrev<Kind>(serialization::DECL_CXX_METHOD); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); + return Abv; +} +} // namespace + void ASTWriter::WriteDeclAbbrevs() { using namespace llvm; @@ -2290,71 +2402,81 @@ void ASTWriter::WriteDeclAbbrevs() { DeclVarAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for DECL_CXX_METHOD + DeclCXXMethodAbbrev = + Stream.EmitAbbrev(getCXXMethodAbbrev<FunctionDecl::TK_NonTemplate>()); + DeclTemplateCXXMethodAbbrev = Stream.EmitAbbrev( + getCXXMethodAbbrev<FunctionDecl::TK_FunctionTemplate>()); + DeclDependentNonTemplateCXXMethodAbbrev = Stream.EmitAbbrev( + getCXXMethodAbbrev<FunctionDecl::TK_DependentNonTemplate>()); + DeclMemberSpecializedCXXMethodAbbrev = Stream.EmitAbbrev( + getCXXMethodAbbrev<FunctionDecl::TK_MemberSpecialization>()); + DeclTemplateSpecializedCXXMethodAbbrev = Stream.EmitAbbrev( + getCXXMethodAbbrev<FunctionDecl::TK_FunctionTemplateSpecialization>()); + DeclDependentSpecializationCXXMethodAbbrev = Stream.EmitAbbrev( + getCXXMethodAbbrev< + FunctionDecl::TK_DependentFunctionTemplateSpecialization>()); + + // Abbreviation for DECL_TEMPLATE_TYPE_PARM Abv = std::make_shared<BitCodeAbbrev>(); - Abv->Add(BitCodeAbbrevOp(serialization::DECL_CXX_METHOD)); - // RedeclarableDecl - Abv->Add(BitCodeAbbrevOp(0)); // CanonicalDecl - // FIXME: Implement abbreviation for other template kinds. 
- Abv->Add(BitCodeAbbrevOp(FunctionDecl::TK_NonTemplate)); // TemplateKind + Abv->Add(BitCodeAbbrevOp(serialization::DECL_TEMPLATE_TYPE_PARM)); + Abv->Add(BitCodeAbbrevOp(0)); // hasTypeConstraint // Decl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 12)); // Packed DeclBits: HasStandaloneLexicalDC, // isInvalidDecl, HasAttrs, isImplicit, isUsed, // isReferenced, TopLevelDeclInObjCContainer, // AccessSpecifier, ModuleOwnershipKind - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID // NamedDecl - Abv->Add(BitCodeAbbrevOp(DeclarationName::Identifier)); // NameKind - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Identifier - Abv->Add(BitCodeAbbrevOp(0)); // AnonDeclNumber - // ValueDecl - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type - // DeclaratorDecl - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // InnerLocStart - Abv->Add(BitCodeAbbrevOp(0)); // HasExtInfo - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TSIType - // FunctionDecl + Abv->Add(BitCodeAbbrevOp(0)); // NameKind = Identifier + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Name + Abv->Add(BitCodeAbbrevOp(0)); + // TypeDecl + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type Ref + // TemplateTypeParmDecl + Abv->Add( + BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // wasDeclaredWithTypename + Abv->Add(BitCodeAbbrevOp(0)); // OwnsDefaultArg + DeclTemplateTypeParmAbbrev = Stream.EmitAbbrev(std::move(Abv)); + + // Abbreviation for DECL_USING_SHADOW + Abv = std::make_shared<BitCodeAbbrev>(); + Abv->Add(BitCodeAbbrevOp(serialization::DECL_USING_SHADOW)); + // Redeclarable + Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration + // Decl + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, + 12)); // Packed DeclBits: HasStandaloneLexicalDC, + // isInvalidDecl, HasAttrs, isImplicit, isUsed, + // isReferenced, TopLevelDeclInObjCContainer, + // AccessSpecifier, ModuleOwnershipKind + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID + // NamedDecl + Abv->Add(BitCodeAbbrevOp(0)); // NameKind = Identifier + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Name + Abv->Add(BitCodeAbbrevOp(0)); + // UsingShadowDecl + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // TargetDecl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 11)); // IDNS - Abv->Add(BitCodeAbbrevOp( - BitCodeAbbrevOp::Fixed, - 27)); // Packed Function Bits: StorageClass, Inline, InlineSpecified, - // VirtualAsWritten, Pure, HasInheritedProto, HasWrittenProto, - // Deleted, Trivial, TrivialForCall, Defaulted, ExplicitlyDefaulted, - // IsIneligibleOrNotSelected, ImplicitReturnZero, Constexpr, - // UsesSEHTry, SkippedBody, MultiVersion, LateParsed, - // FriendConstraintRefersToEnclosingTemplate, Linkage - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LocEnd - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Default - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // ODRHash - // This Array slurps the rest of the record. Fortunately we want to encode - // (nearly) all the remaining (variable number of) fields in the same way. 
- // - // This is: - // NumParams and Params[] from FunctionDecl, and - // NumOverriddenMethods, OverriddenMethods[] from CXXMethodDecl. - // - // Add an AbbrevOp for 'size then elements' and use it here. - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - DeclCXXMethodAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // UsingOrNextShadow + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, + 6)); // InstantiatedFromUsingShadowDecl + DeclUsingShadowAbbrev = Stream.EmitAbbrev(std::move(Abv)); // Abbreviation for EXPR_DECL_REF Abv = std::make_shared<BitCodeAbbrev>(); Abv->Add(BitCodeAbbrevOp(serialization::EXPR_DECL_REF)); - //Stmt - // Expr + // Stmt + // Expr + // PackingBits: DependenceKind, ValueKind, ObjectKind, HasQualifier, + // GetDeclFound, ExplicitTemplateArgs, HadMultipleCandidates, + // NonOdrUseReason, RefersToEnclosingVariableOrCapture, IsImmediateEscalating + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 18)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type - // DependenceKind, ValueKind, ObjectKind - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); - //DeclRefExpr - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //HasQualifier - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //GetDeclFound - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ExplicitTemplateArgs - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //HadMultipleCandidates - Abv->Add(BitCodeAbbrevOp(0)); // RefersToEnclosingVariableOrCapture - Abv->Add(BitCodeAbbrevOp(0)); // NonOdrUseReason - Abv->Add(BitCodeAbbrevOp(0)); // IsImmediateEscalating + // DeclRefExpr Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclRef Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Location DeclRefExprAbbrev = Stream.EmitAbbrev(std::move(Abv)); @@ -2364,10 +2486,10 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(serialization::EXPR_INTEGER_LITERAL)); //Stmt // Expr - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type // DependenceKind, ValueKind, ObjectKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); - //Integer Literal + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type + // Integer Literal Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Location Abv->Add(BitCodeAbbrevOp(32)); // Bit Width Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Value @@ -2378,10 +2500,10 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(serialization::EXPR_CHARACTER_LITERAL)); //Stmt // Expr - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type // DependenceKind, ValueKind, ObjectKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); - //Character Literal + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type + // Character Literal Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // getValue Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Location Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); // getKind @@ -2392,17 +2514,98 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(serialization::EXPR_IMPLICIT_CAST)); // Stmt // Expr + // Packing Bits: DependenceKind, ValueKind, ObjectKind, + // HasFPFeatures, CastKind, PartOfExplicitCast + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 19)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type - // DependenceKind, ValueKind, ObjectKind - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); // CastExpr 
Abv->Add(BitCodeAbbrevOp(0)); // PathSize - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // HasFPFeatures - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 6)); // CastKind - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // PartOfExplicitCast // ImplicitCastExpr ExprImplicitCastAbbrev = Stream.EmitAbbrev(std::move(Abv)); + // Abbreviation for EXPR_BINARY_OPERATOR + Abv = std::make_shared<BitCodeAbbrev>(); + Abv->Add(BitCodeAbbrevOp(serialization::EXPR_BINARY_OPERATOR)); + // Stmt + // Expr + // Packing Bits: DependenceKind, ValueKind, ObjectKind, + // HasFPFeatures, OpKind + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 17)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type + // BinaryOperator + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location + BinaryOperatorAbbrev = Stream.EmitAbbrev(std::move(Abv)); + + // Abbreviation for EXPR_COMPOUND_ASSIGN_OPERATOR + Abv = std::make_shared<BitCodeAbbrev>(); + Abv->Add(BitCodeAbbrevOp(serialization::EXPR_COMPOUND_ASSIGN_OPERATOR)); + // Stmt + // Expr + // Packing Bits: DependenceKind, ValueKind, ObjectKind, + // HasFPFeatures, OpKind + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 17)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type + // BinaryOperator + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location + // CompoundAssignOperator + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHSType + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Result Type + CompoundAssignOperatorAbbrev = Stream.EmitAbbrev(std::move(Abv)); + + // Abbreviation for EXPR_CALL + Abv = std::make_shared<BitCodeAbbrev>(); + Abv->Add(BitCodeAbbrevOp(serialization::EXPR_CALL)); + // Stmt + // Expr + // Packing Bits: DependenceKind, ValueKind, ObjectKind, + // NumArgs, hasStoredFPFeatures, ADLCallKind + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 25)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type + // CallExpr + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location + CallExprAbbrev = Stream.EmitAbbrev(std::move(Abv)); + + // Abbreviation for EXPR_CXX_OPERATOR_CALL + Abv = std::make_shared<BitCodeAbbrev>(); + Abv->Add(BitCodeAbbrevOp(serialization::EXPR_CXX_OPERATOR_CALL)); + // Stmt + // Expr + // Packing Bits: DependenceKind, ValueKind, ObjectKind, + // NumArgs, hasStoredFPFeatures, ADLCallKind, OperatorKind + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 31)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type + // CallExpr + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location + // CXXOperatorCallExpr + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location + CXXOperatorCallExprAbbrev = Stream.EmitAbbrev(std::move(Abv)); + + // Abbreviation for EXPR_CXX_MEMBER_CALL + Abv = std::make_shared<BitCodeAbbrev>(); + Abv->Add(BitCodeAbbrevOp(serialization::EXPR_CXX_MEMBER_CALL)); + // Stmt + // Expr + // Packing Bits: DependenceKind, ValueKind, ObjectKind, + // NumArgs, hasStoredFPFeatures, ADLCallKind + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 25)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type + // CallExpr + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location + // CXXMemberCallExpr + CXXMemberCallExprAbbrev = Stream.EmitAbbrev(std::move(Abv)); + + // Abbreviation for STMT_COMPOUND + Abv = std::make_shared<BitCodeAbbrev>(); + 
Abv->Add(BitCodeAbbrevOp(serialization::STMT_COMPOUND)); + // Stmt + // CompoundStmt + // Packing Bits: Num Stmts, hasStoredFPFeatures + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 21)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location + CompoundStmtAbbrev = Stream.EmitAbbrev(std::move(Abv)); + Abv = std::make_shared<BitCodeAbbrev>(); Abv->Add(BitCodeAbbrevOp(serialization::DECL_CONTEXT_LEXICAL)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 8524484ea8a0b..02cc7798abdb2 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -37,15 +37,70 @@ namespace clang { serialization::StmtCode Code; unsigned AbbrevToUse; + /// A helper that can help us to write a packed bit across function + /// calls. For example, we may write seperate bits in seperate functions: + /// + /// void VisitA(A* a) { + /// Record.push_back(a->isSomething()); + /// } + /// + /// void Visitb(B *b) { + /// VisitA(b); + /// Record.push_back(b->isAnother()); + /// } + /// + /// In such cases, it'll be better if we can pack these 2 bits. We achieve + /// this by writing a zero value in `VisitA` and recorded that first and add + /// the new bit to the recorded value. + class PakedBitsWriter { + public: + PakedBitsWriter(ASTRecordWriter &Record) : RecordRef(Record) {} + ~PakedBitsWriter() { assert(!CurrentIndex); } + + void addBit(bool Value) { + assert(CurrentIndex && "Writing Bits without recording first!"); + PackingBits.addBit(Value); + } + void addBits(uint32_t Value, uint32_t BitsWidth) { + assert(CurrentIndex && "Writing Bits without recording first!"); + PackingBits.addBits(Value, BitsWidth); + } + + void writeBits() { + if (!CurrentIndex) + return; + + RecordRef[*CurrentIndex] = (uint32_t)PackingBits; + CurrentIndex = std::nullopt; + PackingBits.reset(0); + } + + void updateBits() { + writeBits(); + + CurrentIndex = RecordRef.size(); + RecordRef.push_back(0); + } + + private: + BitsPacker PackingBits; + ASTRecordWriter &RecordRef; + std::optional<unsigned> CurrentIndex; + }; + + PakedBitsWriter CurrentPackingBits; + public: ASTStmtWriter(ASTWriter &Writer, ASTWriter::RecordData &Record) : Writer(Writer), Record(Writer, Record), - Code(serialization::STMT_NULL_PTR), AbbrevToUse(0) {} + Code(serialization::STMT_NULL_PTR), AbbrevToUse(0), + CurrentPackingBits(this->Record) {} ASTStmtWriter(const ASTStmtWriter&) = delete; ASTStmtWriter &operator=(const ASTStmtWriter &) = delete; uint64_t Emit() { + CurrentPackingBits.writeBits(); assert(Code != serialization::STMT_NULL_PTR && "unhandled sub-statement writing AST file"); return Record.EmitStmt(Code, AbbrevToUse); @@ -82,14 +137,22 @@ void ASTStmtWriter::VisitNullStmt(NullStmt *S) { void ASTStmtWriter::VisitCompoundStmt(CompoundStmt *S) { VisitStmt(S); - Record.push_back(S->size()); - Record.push_back(S->hasStoredFPFeatures()); + + CurrentPackingBits.updateBits(); + // 20 bits should be enough to store the size of stmts. 
+ CurrentPackingBits.addBits(S->size(), /*Width=*/20); + CurrentPackingBits.addBit(S->hasStoredFPFeatures()); + for (auto *CS : S->body()) Record.AddStmt(CS); if (S->hasStoredFPFeatures()) Record.push_back(S->getStoredFPFeatures().getAsOpaqueInt()); Record.AddSourceLocation(S->getLBracLoc()); Record.AddSourceLocation(S->getRBracLoc()); + + if (!S->hasStoredFPFeatures()) + AbbrevToUse = Writer.getCompoundStmtAbbrev(); + Code = serialization::STMT_COMPOUND; } @@ -143,9 +206,11 @@ void ASTStmtWriter::VisitIfStmt(IfStmt *S) { bool HasVar = S->getConditionVariableDeclStmt() != nullptr; bool HasInit = S->getInit() != nullptr; - Record.push_back(HasElse); - Record.push_back(HasVar); - Record.push_back(HasInit); + CurrentPackingBits.updateBits(); + + CurrentPackingBits.addBit(HasElse); + CurrentPackingBits.addBit(HasVar); + CurrentPackingBits.addBit(HasInit); Record.push_back(static_cast<uint64_t>(S->getStatementKind())); Record.AddStmt(S->getCond()); Record.AddStmt(S->getThen()); @@ -548,15 +613,13 @@ void ASTStmtWriter::VisitCapturedStmt(CapturedStmt *S) { void ASTStmtWriter::VisitExpr(Expr *E) { VisitStmt(E); - Record.AddTypeRef(E->getType()); - - BitsPacker ExprBits; - ExprBits.addBits(E->getDependence(), /*BitsWidth=*/5); - ExprBits.addBits(E->getValueKind(), /*BitsWidth=*/2); - ExprBits.addBits(E->getObjectKind(), /*BitsWidth=*/3); + CurrentPackingBits.updateBits(); + CurrentPackingBits.addBits(E->getDependence(), /*BitsWidth=*/5); + CurrentPackingBits.addBits(E->getValueKind(), /*BitsWidth=*/2); + CurrentPackingBits.addBits(E->getObjectKind(), /*BitsWidth=*/3); - Record.push_back(ExprBits); + Record.AddTypeRef(E->getType()); } void ASTStmtWriter::VisitConstantExpr(ConstantExpr *E) { @@ -612,26 +675,25 @@ void ASTStmtWriter::VisitPredefinedExpr(PredefinedExpr *E) { void ASTStmtWriter::VisitDeclRefExpr(DeclRefExpr *E) { VisitExpr(E); - Record.push_back(E->hasQualifier()); - Record.push_back(E->getDecl() != E->getFoundDecl()); - Record.push_back(E->hasTemplateKWAndArgsInfo()); - Record.push_back(E->hadMultipleCandidates()); - Record.push_back(E->refersToEnclosingVariableOrCapture()); - Record.push_back(E->isNonOdrUse()); - Record.push_back(E->isImmediateEscalating()); + CurrentPackingBits.addBit(E->hasQualifier()); + CurrentPackingBits.addBit(E->getDecl() != E->getFoundDecl()); + CurrentPackingBits.addBit(E->hasTemplateKWAndArgsInfo()); + CurrentPackingBits.addBit(E->hadMultipleCandidates()); + CurrentPackingBits.addBit(E->refersToEnclosingVariableOrCapture()); + CurrentPackingBits.addBits(E->isNonOdrUse(), /*Width=*/2); + CurrentPackingBits.addBit(E->isImmediateEscalating()); if (E->hasTemplateKWAndArgsInfo()) { unsigned NumTemplateArgs = E->getNumTemplateArgs(); - Record.push_back(NumTemplateArgs); + // 12 bits should be sufficient to store the number of template args. + CurrentPackingBits.addBits(NumTemplateArgs, /*Width=*/12); } DeclarationName::NameKind nk = (E->getDecl()->getDeclName().getNameKind()); if ((!E->hasTemplateKWAndArgsInfo()) && (!E->hasQualifier()) && (E->getDecl() == E->getFoundDecl()) && - nk == DeclarationName::Identifier && - !E->refersToEnclosingVariableOrCapture() && !E->isNonOdrUse() && - !E->isImmediateEscalating()) { + nk == DeclarationName::Identifier) { AbbrevToUse = Writer.getDeclRefExprAbbrev(); } @@ -742,11 +804,13 @@ void ASTStmtWriter::VisitUnaryOperator(UnaryOperator *E) { bool HasFPFeatures = E->hasStoredFPFeatures(); // Write this first for easy access when deserializing, as they affect the // size of the UnaryOperator. 
- Record.push_back(HasFPFeatures); + CurrentPackingBits.addBit(HasFPFeatures); Record.AddStmt(E->getSubExpr()); - Record.push_back(E->getOpcode()); // FIXME: stable encoding + CurrentPackingBits.addBits(E->getOpcode(), + /*Width=*/5); // FIXME: stable encoding Record.AddSourceLocation(E->getOperatorLoc()); - Record.push_back(E->canOverflow()); + CurrentPackingBits.addBit(E->canOverflow()); + if (HasFPFeatures) Record.push_back(E->getStoredFPFeatures().getAsOpaqueInt()); Code = serialization::EXPR_UNARY_OPERATOR; @@ -872,12 +936,10 @@ void ASTStmtWriter::VisitOMPIteratorExpr(OMPIteratorExpr *E) { void ASTStmtWriter::VisitCallExpr(CallExpr *E) { VisitExpr(E); - BitsPacker CallExprBits; - // 16 bits should be sufficient to store the number args; - CallExprBits.addBits(E->getNumArgs(), /*BitsWidth=*/16); - CallExprBits.addBit(E->hasStoredFPFeatures()); - CallExprBits.addBit(static_cast<bool>(E->getADLCallKind())); - Record.push_back(CallExprBits); + // 13 bits should be sufficient to store the number args; + CurrentPackingBits.addBits(E->getNumArgs(), /*BitsWidth=*/13); + CurrentPackingBits.addBit(E->hasStoredFPFeatures()); + CurrentPackingBits.addBit(static_cast<bool>(E->getADLCallKind())); Record.AddSourceLocation(E->getRParenLoc()); Record.AddStmt(E->getCallee()); @@ -887,6 +949,10 @@ void ASTStmtWriter::VisitCallExpr(CallExpr *E) { if (E->hasStoredFPFeatures()) Record.push_back(E->getFPFeatures().getAsOpaqueInt()); + + if (!E->hasStoredFPFeatures() && E->getStmtClass() == Stmt::CallExprClass) + AbbrevToUse = Writer.getCallExprAbbrev(); + Code = serialization::EXPR_CALL; } @@ -913,25 +979,27 @@ void ASTStmtWriter::VisitMemberExpr(MemberExpr *E) { // Write these first for easy access when deserializing, as they affect the // size of the MemberExpr. - Record.push_back(HasQualifier); - Record.push_back(HasFoundDecl); - Record.push_back(HasTemplateInfo); - Record.push_back(NumTemplateArgs); + + CurrentPackingBits.addBit(HasQualifier); + CurrentPackingBits.addBit(HasFoundDecl); + CurrentPackingBits.addBit(HasTemplateInfo); + // 12 bits should be enough to store the number of args + CurrentPackingBits.addBits(NumTemplateArgs, /*Width=*/12); Record.AddStmt(E->getBase()); Record.AddDeclRef(E->getMemberDecl()); Record.AddDeclarationNameLoc(E->MemberDNLoc, E->getMemberDecl()->getDeclName()); Record.AddSourceLocation(E->getMemberLoc()); - Record.push_back(E->isArrow()); - Record.push_back(E->hadMultipleCandidates()); - Record.push_back(E->isNonOdrUse()); + CurrentPackingBits.addBit(E->isArrow()); + CurrentPackingBits.addBit(E->hadMultipleCandidates()); + CurrentPackingBits.addBits(E->isNonOdrUse(), /*Width=*/2); Record.AddSourceLocation(E->getOperatorLoc()); if (HasFoundDecl) { DeclAccessPair FoundDecl = E->getFoundDecl(); Record.AddDeclRef(FoundDecl.getDecl()); - Record.push_back(FoundDecl.getAccess()); + CurrentPackingBits.addBits(FoundDecl.getAccess(), /*BitWidth=*/2); } if (HasQualifier) @@ -971,10 +1039,12 @@ void ASTStmtWriter::VisitObjCBridgedCastExpr(ObjCBridgedCastExpr *E) { void ASTStmtWriter::VisitCastExpr(CastExpr *E) { VisitExpr(E); + Record.push_back(E->path_size()); - Record.push_back(E->hasStoredFPFeatures()); + CurrentPackingBits.addBit(E->hasStoredFPFeatures()); + // 7 bits should be enough to store the casting kinds. 
+ CurrentPackingBits.addBits(E->getCastKind(), /*Width=*/7); Record.AddStmt(E->getSubExpr()); - Record.push_back(E->getCastKind()); // FIXME: stable encoding for (CastExpr::path_iterator PI = E->path_begin(), PE = E->path_end(); PI != PE; ++PI) @@ -989,13 +1059,17 @@ void ASTStmtWriter::VisitBinaryOperator(BinaryOperator *E) { bool HasFPFeatures = E->hasStoredFPFeatures(); // Write this first for easy access when deserializing, as they affect the // size of the UnaryOperator. - Record.push_back(HasFPFeatures); - Record.push_back(E->getOpcode()); // FIXME: stable encoding + CurrentPackingBits.addBit(HasFPFeatures); + CurrentPackingBits.addBits(E->getOpcode(), /*Width=*/6); Record.AddStmt(E->getLHS()); Record.AddStmt(E->getRHS()); Record.AddSourceLocation(E->getOperatorLoc()); if (HasFPFeatures) Record.push_back(E->getStoredFPFeatures().getAsOpaqueInt()); + + if (!HasFPFeatures) + AbbrevToUse = Writer.getBinaryOperatorAbbrev(); + Code = serialization::EXPR_BINARY_OPERATOR; } @@ -1003,6 +1077,10 @@ void ASTStmtWriter::VisitCompoundAssignOperator(CompoundAssignOperator *E) { VisitBinaryOperator(E); Record.AddTypeRef(E->getComputationLHSType()); Record.AddTypeRef(E->getComputationResultType()); + + if (!E->hasStoredFPFeatures()) + AbbrevToUse = Writer.getCompoundAssignOperatorAbbrev(); + Code = serialization::EXPR_COMPOUND_ASSIGN_OPERATOR; } @@ -1031,7 +1109,7 @@ ASTStmtWriter::VisitBinaryConditionalOperator(BinaryConditionalOperator *E) { void ASTStmtWriter::VisitImplicitCastExpr(ImplicitCastExpr *E) { VisitCastExpr(E); - Record.push_back(E->isPartOfExplicitCast()); + CurrentPackingBits.addBit(E->isPartOfExplicitCast()); if (E->path_size() == 0 && !E->hasStoredFPFeatures()) AbbrevToUse = Writer.getExprImplicitCastAbbrev(); @@ -1586,13 +1664,21 @@ void ASTStmtWriter::VisitMSDependentExistsStmt(MSDependentExistsStmt *S) { void ASTStmtWriter::VisitCXXOperatorCallExpr(CXXOperatorCallExpr *E) { VisitCallExpr(E); - Record.push_back(E->getOperator()); + CurrentPackingBits.addBits(E->getOperator(), /*Width=*/6); Record.AddSourceRange(E->Range); + + if (!E->hasStoredFPFeatures()) + AbbrevToUse = Writer.getCXXOperatorCallExprAbbrev(); + Code = serialization::EXPR_CXX_OPERATOR_CALL; } void ASTStmtWriter::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) { VisitCallExpr(E); + + if (!E->hasStoredFPFeatures()) + AbbrevToUse = Writer.getCXXMemberCallExprAbbrev(); + Code = serialization::EXPR_CXX_MEMBER_CALL; } @@ -1673,7 +1759,9 @@ void ASTStmtWriter::VisitCXXStdInitializerListExpr(CXXStdInitializerListExpr *E) void ASTStmtWriter::VisitCXXNamedCastExpr(CXXNamedCastExpr *E) { VisitExplicitCastExpr(E); Record.AddSourceRange(SourceRange(E->getOperatorLoc(), E->getRParenLoc())); - Record.AddSourceRange(E->getAngleBrackets()); + CurrentPackingBits.addBit(E->getAngleBrackets().isValid()); + if (E->getAngleBrackets().isValid()) + Record.AddSourceRange(E->getAngleBrackets()); } void ASTStmtWriter::VisitCXXStaticCastExpr(CXXStaticCastExpr *E) { @@ -1884,9 +1972,10 @@ void ASTStmtWriter::VisitCXXDependentScopeMemberExpr( // Don't emit anything here (or if you do you will have to update // the corresponding deserialization function). - Record.push_back(E->hasTemplateKWAndArgsInfo()); - Record.push_back(E->getNumTemplateArgs()); - Record.push_back(E->hasFirstQualifierFoundInScope()); + CurrentPackingBits.addBit(E->hasTemplateKWAndArgsInfo()); + // 16 bits should be enough to store the number of template args. 
+ CurrentPackingBits.addBits(E->getNumTemplateArgs(), /*Width=*/16); + CurrentPackingBits.addBit(E->hasFirstQualifierFoundInScope()); if (E->hasTemplateKWAndArgsInfo()) { const ASTTemplateKWAndArgsInfo &ArgInfo = @@ -1895,14 +1984,15 @@ void ASTStmtWriter::VisitCXXDependentScopeMemberExpr( E->getTrailingObjects<TemplateArgumentLoc>()); } - Record.push_back(E->isArrow()); - Record.AddSourceLocation(E->getOperatorLoc()); + CurrentPackingBits.addBit(E->isArrow()); + Record.AddTypeRef(E->getBaseType()); Record.AddNestedNameSpecifierLoc(E->getQualifierLoc()); + CurrentPackingBits.addBit(!E->isImplicitAccess()); if (!E->isImplicitAccess()) Record.AddStmt(E->getBase()); - else - Record.AddStmt(nullptr); + + Record.AddSourceLocation(E->getOperatorLoc()); if (E->hasFirstQualifierFoundInScope()) Record.AddDeclRef(E->getFirstQualifierFoundInScope()); @@ -1917,12 +2007,14 @@ ASTStmtWriter::VisitDependentScopeDeclRefExpr(DependentScopeDeclRefExpr *E) { // Don't emit anything here, HasTemplateKWAndArgsInfo must be // emitted first. + CurrentPackingBits.addBit( + E->DependentScopeDeclRefExprBits.HasTemplateKWAndArgsInfo); - Record.push_back(E->DependentScopeDeclRefExprBits.HasTemplateKWAndArgsInfo); if (E->DependentScopeDeclRefExprBits.HasTemplateKWAndArgsInfo) { const ASTTemplateKWAndArgsInfo &ArgInfo = *E->getTrailingObjects<ASTTemplateKWAndArgsInfo>(); - Record.push_back(ArgInfo.NumTemplateArgs); + // 16 bits should be enought to store the number of args + CurrentPackingBits.addBits(ArgInfo.NumTemplateArgs, /*Width=*/16); AddTemplateKWAndArgsInfo(ArgInfo, E->getTrailingObjects<TemplateArgumentLoc>()); } @@ -1949,19 +2041,17 @@ ASTStmtWriter::VisitCXXUnresolvedConstructExpr(CXXUnresolvedConstructExpr *E) { void ASTStmtWriter::VisitOverloadExpr(OverloadExpr *E) { VisitExpr(E); - BitsPacker OverloadExprBits; - // 14 Bits should enough to store the number of decls. - OverloadExprBits.addBits(E->getNumDecls(), /*BitWidth=*/14); - OverloadExprBits.addBit(E->hasTemplateKWAndArgsInfo()); + CurrentPackingBits.updateBits(); + // 12 Bits should enough to store the number of decls. + CurrentPackingBits.addBits(E->getNumDecls(), /*BitWidth=*/12); + CurrentPackingBits.addBit(E->hasTemplateKWAndArgsInfo()); if (E->hasTemplateKWAndArgsInfo()) { const ASTTemplateKWAndArgsInfo &ArgInfo = *E->getTrailingASTTemplateKWAndArgsInfo(); - // 14 Bits should enough to store the number of template args. - OverloadExprBits.addBits(ArgInfo.NumTemplateArgs, /*BitWidth=*/14); - Record.push_back(OverloadExprBits); + // 12 Bits should enough to store the number of template args. + CurrentPackingBits.addBits(ArgInfo.NumTemplateArgs, /*BitWidth=*/12); AddTemplateKWAndArgsInfo(ArgInfo, E->getTrailingTemplateArgumentLoc()); - } else - Record.push_back(OverloadExprBits); + } for (OverloadExpr::decls_iterator OvI = E->decls_begin(), OvE = E->decls_end(); @@ -1976,18 +2066,22 @@ void ASTStmtWriter::VisitOverloadExpr(OverloadExpr *E) { void ASTStmtWriter::VisitUnresolvedMemberExpr(UnresolvedMemberExpr *E) { VisitOverloadExpr(E); - Record.push_back(E->isArrow()); - Record.push_back(E->hasUnresolvedUsing()); - Record.AddStmt(!E->isImplicitAccess() ? 
E->getBase() : nullptr); - Record.AddTypeRef(E->getBaseType()); + CurrentPackingBits.addBit(E->isArrow()); + CurrentPackingBits.addBit(E->hasUnresolvedUsing()); + CurrentPackingBits.addBit(!E->isImplicitAccess()); + if (!E->isImplicitAccess()) + Record.AddStmt(E->getBase()); + Record.AddSourceLocation(E->getOperatorLoc()); + + Record.AddTypeRef(E->getBaseType()); Code = serialization::EXPR_CXX_UNRESOLVED_MEMBER; } void ASTStmtWriter::VisitUnresolvedLookupExpr(UnresolvedLookupExpr *E) { VisitOverloadExpr(E); - Record.push_back(E->requiresADL()); - Record.push_back(E->isOverloaded()); + CurrentPackingBits.addBit(E->requiresADL()); + CurrentPackingBits.addBit(E->isOverloaded()); Record.AddDeclRef(E->getNamingClass()); Code = serialization::EXPR_CXX_UNRESOLVED_LOOKUP; } @@ -2059,12 +2153,12 @@ void ASTStmtWriter::VisitSubstNonTypeTemplateParmExpr( SubstNonTypeTemplateParmExpr *E) { VisitExpr(E); Record.AddDeclRef(E->getAssociatedDecl()); - Record.push_back(E->isReferenceParameter()); - Record.push_back(E->getIndex()); + CurrentPackingBits.addBit(E->isReferenceParameter()); + CurrentPackingBits.addBits(E->getIndex(), /*Width=*/12); + CurrentPackingBits.addBit((bool)E->getPackIndex()); if (auto PackIndex = E->getPackIndex()) Record.push_back(*PackIndex + 1); - else - Record.push_back(0); + Record.AddSourceLocation(E->getNameLoc()); Record.AddStmt(E->getReplacement()); Code = serialization::EXPR_SUBST_NON_TYPE_TEMPLATE_PARM; From 5e92fa69eaf0a3d18ff9a8ba0fb27726b38ba035 Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Wed, 20 Dec 2023 18:39:59 -0800 Subject: [PATCH 019/342] [NFC][sanitizer] Fix typo in comment --- .../lib/sanitizer_common/symbolizer/sanitizer_symbolize.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/symbolizer/sanitizer_symbolize.cpp b/compiler-rt/lib/sanitizer_common/symbolizer/sanitizer_symbolize.cpp index 4bdf75332bf33..c851dbbf2eb23 100644 --- a/compiler-rt/lib/sanitizer_common/symbolizer/sanitizer_symbolize.cpp +++ b/compiler-rt/lib/sanitizer_common/symbolizer/sanitizer_symbolize.cpp @@ -71,7 +71,7 @@ bool __sanitizer_symbolize_code(const char *ModuleName, uint64_t ModuleOffset, auto Printer = std::make_unique<llvm::symbolize::LLVMPrinter>( OS, symbolize_error_handler(OS), Config); - // TODO: it is neccessary to set proper SectionIndex here. + // TODO: it is necessary to set proper SectionIndex here. // object::SectionedAddress::UndefSection works for only absolute addresses. if (InlineFrames) { auto ResOrErr = getDefaultSymbolizer()->symbolizeInlinedCode( @@ -103,7 +103,7 @@ bool __sanitizer_symbolize_data(const char *ModuleName, uint64_t ModuleOffset, auto Printer = std::make_unique<llvm::symbolize::LLVMPrinter>( OS, symbolize_error_handler(OS), Config); - // TODO: it is neccessary to set proper SectionIndex here. + // TODO: it is necessary to set proper SectionIndex here. // object::SectionedAddress::UndefSection works for only absolute addresses. auto ResOrErr = getDefaultSymbolizer()->symbolizeData( ModuleName, @@ -126,7 +126,7 @@ bool __sanitizer_symbolize_frame(const char *ModuleName, uint64_t ModuleOffset, auto Printer = std::make_unique<llvm::symbolize::LLVMPrinter>( OS, symbolize_error_handler(OS), Config); - // TODO: it is neccessary to set proper SectionIndex here. + // TODO: it is necessary to set proper SectionIndex here. // object::SectionedAddress::UndefSection works for only absolute addresses. 
auto ResOrErr = getDefaultSymbolizer()->symbolizeFrame( ModuleName, From 61b58123a3137323d6876006a6171d42e5e03cc1 Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Thu, 21 Dec 2023 10:40:32 +0800 Subject: [PATCH 020/342] [X86][NFC] Not imply EVEX in NoCD8 NDD (new data destination) instructions need to set NoCD8 and EVEX_4V. EVEX_4V already implies EVEX. If NoCD8 implied EVEX too, we would not be able to reuse the class. --- llvm/lib/Target/X86/X86InstrMisc.td | 8 ++++---- llvm/lib/Target/X86/X86InstrSystem.td | 2 +- llvm/lib/Target/X86/X86InstrUtils.td | 2 +- llvm/lib/Target/X86/X86InstrVMX.td | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 2ea10e317e12b..3006969b76d67 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1505,11 +1505,11 @@ def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), def MOVDIRI32_EVEX : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore32 addr:$dst, GR32:$src)]>, - EVEX_NoCD8, T_MAP4PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; + EVEX, NoCD8, T_MAP4PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; def MOVDIRI64_EVEX : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore64 addr:$dst, GR64:$src)]>, - EVEX_NoCD8, T_MAP4PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; + EVEX, NoCD8, T_MAP4PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1530,11 +1530,11 @@ def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem_GR64:$src), def MOVDIR64B32_EVEX : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem_GR32:$src), "movdir64b\t{$src, $dst|$dst, $src}", [(int_x86_movdir64b GR32:$dst, addr:$src)]>, - EVEX_NoCD8, T_MAP4PD, AdSize32, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>; + EVEX, NoCD8, T_MAP4PD, AdSize32, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>; def MOVDIR64B64_EVEX : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem_GR64:$src), "movdir64b\t{$src, $dst|$dst, $src}", [(int_x86_movdir64b GR64:$dst, addr:$src)]>, - EVEX_NoCD8, T_MAP4PD, AdSize64, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>; + EVEX, NoCD8, T_MAP4PD, AdSize64, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>; } // SchedRW //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index cbb5d4ed5bbdc..51972c63bb2ce 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -682,7 +682,7 @@ def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), def INVPCID64_EVEX : I<0xF2, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), "invpcid\t{$src2, $src1|$src1, $src2}", []>, - EVEX_NoCD8, T_MAP4XS, Requires<[In64BitMode, HasINVPCID]>; + EVEX, NoCD8, T_MAP4XS, Requires<[In64BitMode, HasINVPCID]>; } // SchedRW let Predicates = [In64BitMode, HasINVPCID] in { diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 78a4812903597..3d8f4e642aebe 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -86,7 +86,7 @@ class EVEX_CD8<int esize, CD8VForm form> { int CD8_EltSize = !srl(esize, 3); bits<3> CD8_Form = form.Value; } -class 
EVEX_NoCD8 : EVEX { bits<7> CD8_Scale = 0; } +class NoCD8 { bits<7> CD8_Scale = 0; } class XOP { Encoding OpEnc = EncXOP; } class XOP_4V : XOP { bit hasVEX_4V = 1; } class EVEX2VEXOverride<string VEXInstrName> { diff --git a/llvm/lib/Target/X86/X86InstrVMX.td b/llvm/lib/Target/X86/X86InstrVMX.td index 5289819119ceb..c3fba9c5728ca 100644 --- a/llvm/lib/Target/X86/X86InstrVMX.td +++ b/llvm/lib/Target/X86/X86InstrVMX.td @@ -24,7 +24,7 @@ def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), Requires<[In64BitMode]>; def INVEPT64_EVEX : I<0xF0, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), "invept\t{$src2, $src1|$src1, $src2}", []>, - EVEX_NoCD8, T_MAP4XS, Requires<[In64BitMode]>; + EVEX, NoCD8, T_MAP4XS, Requires<[In64BitMode]>; // 66 0F 38 81 def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), @@ -35,7 +35,7 @@ def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), Requires<[In64BitMode]>; def INVVPID64_EVEX : I<0xF1, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), "invvpid\t{$src2, $src1|$src1, $src2}", []>, - EVEX_NoCD8, T_MAP4XS, Requires<[In64BitMode]>; + EVEX, NoCD8, T_MAP4XS, Requires<[In64BitMode]>; // 0F 01 C1 def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB; From 5fa46daab3428eedfbf6eed0f442e4f797033adf Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Thu, 21 Dec 2023 11:05:56 +0800 Subject: [PATCH 021/342] [X86] Replace EVEX_NoCD8 with EVEX, NoCD8 This fixes the build error after 61b58123a3137323d6876006a6171d42e5e03cc1 --- llvm/lib/Target/X86/X86InstrSSE.td | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index be6962ebbb4fb..2e1560a9f7dc1 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -6772,7 +6772,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, HasEGPR, In64BitMode] in [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, (i8 timm:$src3)))]>, - EVEX_NoCD8, T_MAP4PS, Sched<[SchedWriteVecIMul.XMM]>; + EVEX, NoCD8, T_MAP4PS, Sched<[SchedWriteVecIMul.XMM]>; def SHA1RNDS4rmi_EVEX: Ii8<0xD4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", @@ -6780,31 +6780,31 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, HasEGPR, In64BitMode] in (int_x86_sha1rnds4 VR128:$src1, (memop addr:$src2), (i8 timm:$src3)))]>, - EVEX_NoCD8, T_MAP4PS, + EVEX, NoCD8, T_MAP4PS, Sched<[SchedWriteVecIMul.XMM.Folded, SchedWriteVecIMul.XMM.ReadAfterFold]>; defm SHA1NEXTE : SHAI_binop<0xD8, "sha1nexte", int_x86_sha1nexte, SchedWriteVecIMul.XMM, "_EVEX">, - EVEX_NoCD8, T_MAP4PS; + EVEX, NoCD8, T_MAP4PS; defm SHA1MSG1 : SHAI_binop<0xD9, "sha1msg1", int_x86_sha1msg1, SchedWriteVecIMul.XMM, "_EVEX">, - EVEX_NoCD8, T_MAP4PS; + EVEX, NoCD8, T_MAP4PS; defm SHA1MSG2 : SHAI_binop<0xDA, "sha1msg2", int_x86_sha1msg2, SchedWriteVecIMul.XMM, "_EVEX">, - EVEX_NoCD8, T_MAP4PS; + EVEX, NoCD8, T_MAP4PS; let Uses=[XMM0] in defm SHA256RNDS2 : SHAI_binop<0xDB, "sha256rnds2", int_x86_sha256rnds2, SchedWriteVecIMul.XMM, "_EVEX", 1>, - EVEX_NoCD8, T_MAP4PS; + EVEX, NoCD8, T_MAP4PS; defm SHA256MSG1 : SHAI_binop<0xDC, "sha256msg1", int_x86_sha256msg1, SchedWriteVecIMul.XMM, "_EVEX">, - EVEX_NoCD8, T_MAP4PS; + EVEX, NoCD8, T_MAP4PS; defm SHA256MSG2 : SHAI_binop<0xDD, "sha256msg2", int_x86_sha256msg2, SchedWriteVecIMul.XMM, "_EVEX">, - EVEX_NoCD8, T_MAP4PS; + EVEX, 
NoCD8, T_MAP4PS; } //===----------------------------------------------------------------------===// From b26c0ed93a1b735396f3b167ea47d82357468c96 Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Thu, 21 Dec 2023 10:59:20 +0800 Subject: [PATCH 022/342] [X86][NFC] Remove class BinOpRM_ImplicitUse b/c it's used once only --- llvm/lib/Target/X86/X86InstrArithmetic.td | 28 ++++++++--------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 8c355e84a0659..46b430a842ef0 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -121,19 +121,6 @@ class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, mnemonic, "{$src2, $src1|$src1, $src2}", pattern>, Sched<[sched.Folded, sched.ReadAfterFold]>; -// BinOpRM_ImplicitUse - Binary instructions with inputs "reg, [mem]". -// There is an implicit register read at the end of the operand sequence. -class BinOpRM_ImplicitUse<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - dag outlist, X86FoldableSchedWrite sched, list<dag> pattern> - : ITy<opcode, MRMSrcMem, typeinfo, outlist, - (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern>, - Sched<[sched.Folded, sched.ReadAfterFold, - // base, scale, index, offset, segment. - ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, - // implicit register read. - sched.ReadAfterFold]>; - // BinOpRM_F - Binary instructions with inputs "reg, [mem]", where the pattern // has just a EFLAGS as a result. class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, @@ -154,11 +141,16 @@ class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, // has both a regclass and EFLAGS as a result, and has EFLAGS as input. class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, SDNode opnode> - : BinOpRM_ImplicitUse<opcode, mnemonic, typeinfo, - (outs typeinfo.RegClass:$dst), WriteADC, - [(set typeinfo.RegClass:$dst, EFLAGS, - (opnode typeinfo.RegClass:$src1, - (typeinfo.LoadNode addr:$src2), EFLAGS))]>; + : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC, + [(set typeinfo.RegClass:$dst, EFLAGS, + (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2), + EFLAGS))]> { + let SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold, + // base, scale, index, offset, segment. + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // implicit register read. + WriteADC.ReadAfterFold]; +} // BinOpRI - Binary instructions with inputs "reg, imm". class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, From a25da1a92120eb5cb74f1a3d28a4849178cfbdff Mon Sep 17 00:00:00 2001 From: Valentin Clement <clementval@gmail.com> Date: Thu, 7 Dec 2023 14:04:54 -0800 Subject: [PATCH 023/342] [mlir][openacc] Add device_type support for compute operations (#75864) Re-land PR after being reverted because of buildbot failures. This patch adds representation for `device_type` clause information on compute construct (parallel, kernels, serial). The `device_type` clause on compute construct impacts clauses that appear after it. The values impacted by `device_type` are now tied with an attribute array that represent the device_type associated with them. `DeviceType::None` is used to represent the value produced by a clause before any `device_type`. 
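As a quick illustration (a sketch that follows the printing convention described below, not taken from the patch's own tests), an `async` clause split across device types would keep the value that appears before any `device_type` under `DeviceType::None` and suffix the later value with its device type:

```
!$acc parallel async(1) device_type(nvidia) async(3)
```

```
acc.parallel async(%c1 : i32, %c3 : i32 [#acc.device_type<nvidia>]) {
}
```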
The operands and the attribute information are parsed/printed together. This is an example with the `vector_length` clause. The first value (64) is not impacted by `device_type`, so it will be represented with DeviceType::None. None is not printed. The second value (128) is tied with the `device_type(multicore)` clause.

```
!$acc parallel vector_length(64) device_type(multicore) vector_length(128)
```

```
acc.parallel vector_length(%c64 : i32, %c128 : i32 [#acc.device_type<multicore>]) {
}
```

When multiple values can be produced for a single clause like `num_gangs` and `wait`, an extra attribute describes the number of values belonging to each `device_type`. Values and attributes are parsed/printed together.

```
acc.parallel num_gangs({%c2 : i32, %c4 : i32}, {%c4 : i32} [#acc.device_type<nvidia>])
```

While preparing this patch I noticed that the wait devnum is not part of the operations and is not lowered. It will be added in a follow-up patch.
---
 flang/lib/Lower/OpenACC.cpp                   | 107 +++-
 flang/test/Lower/OpenACC/acc-device-type.f90  |  44 ++
 flang/test/Lower/OpenACC/acc-kernels-loop.f90 |  14 +-
 flang/test/Lower/OpenACC/acc-kernels.f90      |  14 +-
 .../test/Lower/OpenACC/acc-parallel-loop.f90  |  14 +-
 flang/test/Lower/OpenACC/acc-parallel.f90     |  16 +-
 flang/test/Lower/OpenACC/acc-serial-loop.f90  |  10 +-
 flang/test/Lower/OpenACC/acc-serial.f90       |  10 +-
 .../mlir/Dialect/OpenACC/OpenACCOps.td        | 286 +++++++---
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp       | 515 +++++++++++++++++-
 mlir/test/Dialect/OpenACC/invalid.mlir        |   4 +-
 mlir/test/Dialect/OpenACC/ops.mlir            |  76 +--
 12 files changed, 932 insertions(+), 178 deletions(-)
 create mode 100644 flang/test/Lower/OpenACC/acc-device-type.f90

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index fae54eefb02f7..ecf70818c4ac0 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -1480,7 +1480,7 @@ getDeviceType(Fortran::parser::AccDeviceTypeExpr::Device device) {
   case Fortran::parser::AccDeviceTypeExpr::Device::Multicore:
     return mlir::acc::DeviceType::Multicore;
   }
-  return mlir::acc::DeviceType::Default;
+  return mlir::acc::DeviceType::None;
 }
 
 static void gatherDeviceTypeAttrs(
@@ -1781,26 +1781,24 @@ createComputeOp(Fortran::lower::AbstractConverter &converter,
                 bool outerCombined = false) {
 
   // Parallel operation operands
-  mlir::Value async;
-  mlir::Value numWorkers;
-  mlir::Value vectorLength;
   mlir::Value ifCond;
   mlir::Value selfCond;
-  mlir::Value waitDevnum;
   llvm::SmallVector<mlir::Value> waitOperands, attachEntryOperands,
       copyEntryOperands, copyoutEntryOperands, createEntryOperands,
-      dataClauseOperands, numGangs;
+      dataClauseOperands, numGangs, numWorkers, vectorLength, async;
+  llvm::SmallVector<mlir::Attribute> numGangsDeviceTypes, numWorkersDeviceTypes,
+      vectorLengthDeviceTypes, asyncDeviceTypes, asyncOnlyDeviceTypes,
+      waitOperandsDeviceTypes, waitOnlyDeviceTypes;
+  llvm::SmallVector<int32_t> numGangsSegments, waitOperandsSegments;
 
   llvm::SmallVector<mlir::Value> reductionOperands, privateOperands,
       firstprivateOperands;
   llvm::SmallVector<mlir::Attribute> privatizations, firstPrivatizations,
       reductionRecipes;
 
-  // Async, wait and self clause have optional values but can be present with
+  // Self clause has optional values but can be present with
   // no value as well. When there is no value, the op has an attribute to
   // represent the clause.
- bool addAsyncAttr = false; - bool addWaitAttr = false; bool addSelfAttr = false; bool hasDefaultNone = false; @@ -1808,6 +1806,11 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + // device_type attribute is set to `none` until a device_type clause is + // encountered. + auto crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( + builder.getContext(), mlir::acc::DeviceType::None); + // Lower clauses values mapped to operands. // Keep track of each group of operands separatly as clauses can appear // more than once. @@ -1815,27 +1818,52 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, mlir::Location clauseLocation = converter.genLocation(clause.source); if (const auto *asyncClause = std::get_if<Fortran::parser::AccClause::Async>(&clause.u)) { - genAsyncClause(converter, asyncClause, async, addAsyncAttr, stmtCtx); + const auto &asyncClauseValue = asyncClause->v; + if (asyncClauseValue) { // async has a value. + async.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*asyncClauseValue), stmtCtx))); + asyncDeviceTypes.push_back(crtDeviceTypeAttr); + } else { + asyncOnlyDeviceTypes.push_back(crtDeviceTypeAttr); + } } else if (const auto *waitClause = std::get_if<Fortran::parser::AccClause::Wait>(&clause.u)) { - genWaitClause(converter, waitClause, waitOperands, waitDevnum, - addWaitAttr, stmtCtx); + const auto &waitClauseValue = waitClause->v; + if (waitClauseValue) { // wait has a value. + const Fortran::parser::AccWaitArgument &waitArg = *waitClauseValue; + const auto &waitList = + std::get<std::list<Fortran::parser::ScalarIntExpr>>(waitArg.t); + auto crtWaitOperands = waitOperands.size(); + for (const Fortran::parser::ScalarIntExpr &value : waitList) { + waitOperands.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(value), stmtCtx))); + } + waitOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + waitOperandsSegments.push_back(waitOperands.size() - crtWaitOperands); + } else { + waitOnlyDeviceTypes.push_back(crtDeviceTypeAttr); + } } else if (const auto *numGangsClause = std::get_if<Fortran::parser::AccClause::NumGangs>( &clause.u)) { + auto crtNumGangs = numGangs.size(); for (const Fortran::parser::ScalarIntExpr &expr : numGangsClause->v) numGangs.push_back(fir::getBase(converter.genExprValue( *Fortran::semantics::GetExpr(expr), stmtCtx))); + numGangsDeviceTypes.push_back(crtDeviceTypeAttr); + numGangsSegments.push_back(numGangs.size() - crtNumGangs); } else if (const auto *numWorkersClause = std::get_if<Fortran::parser::AccClause::NumWorkers>( &clause.u)) { - numWorkers = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(numWorkersClause->v), stmtCtx)); + numWorkers.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(numWorkersClause->v), stmtCtx))); + numWorkersDeviceTypes.push_back(crtDeviceTypeAttr); } else if (const auto *vectorLengthClause = std::get_if<Fortran::parser::AccClause::VectorLength>( &clause.u)) { - vectorLength = fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(vectorLengthClause->v), stmtCtx)); + vectorLength.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(vectorLengthClause->v), stmtCtx))); + vectorLengthDeviceTypes.push_back(crtDeviceTypeAttr); } else if (const auto *ifClause = std::get_if<Fortran::parser::AccClause::If>(&clause.u)) { genIfClause(converter, clauseLocation, ifClause, ifCond, stmtCtx); @@ -1986,18 +2014,27 @@ 
createComputeOp(Fortran::lower::AbstractConverter &converter, else if ((defaultClause->v).v == llvm::acc::DefaultValue::ACC_Default_present) hasDefaultPresent = true; + } else if (const auto *deviceTypeClause = + std::get_if<Fortran::parser::AccClause::DeviceType>( + &clause.u)) { + const Fortran::parser::AccDeviceTypeExprList &deviceTypeExprList = + deviceTypeClause->v; + assert(deviceTypeExprList.v.size() == 1 && + "expect only one device_type expr"); + crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( + builder.getContext(), getDeviceType(deviceTypeExprList.v.front().v)); } } // Prepare the operand segment size attribute and the operands value range. llvm::SmallVector<mlir::Value, 8> operands; llvm::SmallVector<int32_t, 8> operandSegments; - addOperand(operands, operandSegments, async); + addOperands(operands, operandSegments, async); addOperands(operands, operandSegments, waitOperands); if constexpr (!std::is_same_v<Op, mlir::acc::SerialOp>) { addOperands(operands, operandSegments, numGangs); - addOperand(operands, operandSegments, numWorkers); - addOperand(operands, operandSegments, vectorLength); + addOperands(operands, operandSegments, numWorkers); + addOperands(operands, operandSegments, vectorLength); } addOperand(operands, operandSegments, ifCond); addOperand(operands, operandSegments, selfCond); @@ -2018,10 +2055,6 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, builder, currentLocation, eval, operands, operandSegments, outerCombined); - if (addAsyncAttr) - computeOp.setAsyncAttrAttr(builder.getUnitAttr()); - if (addWaitAttr) - computeOp.setWaitAttrAttr(builder.getUnitAttr()); if (addSelfAttr) computeOp.setSelfAttrAttr(builder.getUnitAttr()); @@ -2030,6 +2063,34 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, if (hasDefaultPresent) computeOp.setDefaultAttr(mlir::acc::ClauseDefaultValue::Present); + if constexpr (!std::is_same_v<Op, mlir::acc::SerialOp>) { + if (!numWorkersDeviceTypes.empty()) + computeOp.setNumWorkersDeviceTypeAttr( + mlir::ArrayAttr::get(builder.getContext(), numWorkersDeviceTypes)); + if (!vectorLengthDeviceTypes.empty()) + computeOp.setVectorLengthDeviceTypeAttr( + mlir::ArrayAttr::get(builder.getContext(), vectorLengthDeviceTypes)); + if (!numGangsDeviceTypes.empty()) + computeOp.setNumGangsDeviceTypeAttr( + mlir::ArrayAttr::get(builder.getContext(), numGangsDeviceTypes)); + if (!numGangsSegments.empty()) + computeOp.setNumGangsSegmentsAttr( + builder.getDenseI32ArrayAttr(numGangsSegments)); + } + if (!asyncDeviceTypes.empty()) + computeOp.setAsyncDeviceTypeAttr(builder.getArrayAttr(asyncDeviceTypes)); + if (!asyncOnlyDeviceTypes.empty()) + computeOp.setAsyncOnlyAttr(builder.getArrayAttr(asyncOnlyDeviceTypes)); + + if (!waitOperandsDeviceTypes.empty()) + computeOp.setWaitOperandsDeviceTypeAttr( + builder.getArrayAttr(waitOperandsDeviceTypes)); + if (!waitOperandsSegments.empty()) + computeOp.setWaitOperandsSegmentsAttr( + builder.getDenseI32ArrayAttr(waitOperandsSegments)); + if (!waitOnlyDeviceTypes.empty()) + computeOp.setWaitOnlyAttr(builder.getArrayAttr(waitOnlyDeviceTypes)); + if constexpr (!std::is_same_v<Op, mlir::acc::KernelsOp>) { if (!privatizations.empty()) computeOp.setPrivatizationsAttr( diff --git a/flang/test/Lower/OpenACC/acc-device-type.f90 b/flang/test/Lower/OpenACC/acc-device-type.f90 new file mode 100644 index 0000000000000..871dbc95f60fc --- /dev/null +++ b/flang/test/Lower/OpenACC/acc-device-type.f90 @@ -0,0 +1,44 @@ +! This test checks lowering of OpenACC device_type clause on directive where its +! 
position and the clauses that follow have special semantic + +! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s + +subroutine sub1() + + !$acc parallel num_workers(16) + !$acc end parallel + +! CHECK: acc.parallel num_workers(%c16{{.*}} : i32) { + + !$acc parallel num_workers(1) device_type(nvidia) num_workers(16) + !$acc end parallel + +! CHECK: acc.parallel num_workers(%c1{{.*}} : i32, %c16{{.*}} : i32 [#acc.device_type<nvidia>]) + + !$acc parallel device_type(*) num_workers(1) device_type(nvidia) num_workers(16) + !$acc end parallel + +! CHECK: acc.parallel num_workers(%c1{{.*}} : i32 [#acc.device_type<star>], %c16{{.*}} : i32 [#acc.device_type<nvidia>]) + + !$acc parallel vector_length(1) + !$acc end parallel + +! CHECK: acc.parallel vector_length(%c1{{.*}} : i32) + + !$acc parallel device_type(multicore) vector_length(1) + !$acc end parallel + +! CHECK: acc.parallel vector_length(%c1{{.*}} : i32 [#acc.device_type<multicore>]) + + !$acc parallel num_gangs(2) device_type(nvidia) num_gangs(4) + !$acc end parallel + +! CHECK: acc.parallel num_gangs({%c2{{.*}} : i32}, {%c4{{.*}} : i32} [#acc.device_type<nvidia>]) + + !$acc parallel num_gangs(2) device_type(nvidia) num_gangs(1, 1, 1) + !$acc end parallel + +! CHECK: acc.parallel num_gangs({%c2{{.*}} : i32}, {%c1{{.*}} : i32, %c1{{.*}} : i32, %c1{{.*}} : i32} [#acc.device_type<nvidia>]) + + +end subroutine diff --git a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 index 34e7232697241..93bc699031d55 100644 --- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 @@ -62,7 +62,7 @@ subroutine acc_kernels_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator -! CHECK-NEXT: } attributes {asyncAttr} +! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} !$acc kernels loop async(1) DO i = 1, n @@ -103,7 +103,7 @@ subroutine acc_kernels_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator -! CHECK-NEXT: } attributes {waitAttr} +! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} !$acc kernels loop wait(1) DO i = 1, n @@ -111,7 +111,7 @@ subroutine acc_kernels_loop END DO ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.kernels wait([[WAIT1]] : i32) { +! CHECK: acc.kernels wait({[[WAIT1]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -126,7 +126,7 @@ subroutine acc_kernels_loop ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.kernels wait([[WAIT2]], [[WAIT3]] : i32, i32) { +! CHECK: acc.kernels wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -141,7 +141,7 @@ subroutine acc_kernels_loop ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.kernels wait([[WAIT4]], [[WAIT5]] : i32, i32) { +! CHECK: acc.kernels wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -155,7 +155,7 @@ subroutine acc_kernels_loop END DO ! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.kernels num_gangs([[NUMGANGS1]] : i32) { +! CHECK: acc.kernels num_gangs({[[NUMGANGS1]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -169,7 +169,7 @@ subroutine acc_kernels_loop END DO ! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.kernels num_gangs([[NUMGANGS2]] : i32) { +! 
CHECK: acc.kernels num_gangs({[[NUMGANGS2]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield diff --git a/flang/test/Lower/OpenACC/acc-kernels.f90 b/flang/test/Lower/OpenACC/acc-kernels.f90 index 1f882c6df5106..99629bb835172 100644 --- a/flang/test/Lower/OpenACC/acc-kernels.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels.f90 @@ -40,7 +40,7 @@ subroutine acc_kernels ! CHECK: acc.kernels { ! CHECK: acc.terminator -! CHECK-NEXT: } attributes {asyncAttr} +! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} !$acc kernels async(1) !$acc end kernels @@ -63,13 +63,13 @@ subroutine acc_kernels ! CHECK: acc.kernels { ! CHECK: acc.terminator -! CHECK-NEXT: } attributes {waitAttr} +! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} !$acc kernels wait(1) !$acc end kernels ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.kernels wait([[WAIT1]] : i32) { +! CHECK: acc.kernels wait({[[WAIT1]] : i32}) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -78,7 +78,7 @@ subroutine acc_kernels ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.kernels wait([[WAIT2]], [[WAIT3]] : i32, i32) { +! CHECK: acc.kernels wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -87,7 +87,7 @@ subroutine acc_kernels ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.kernels wait([[WAIT4]], [[WAIT5]] : i32, i32) { +! CHECK: acc.kernels wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -95,7 +95,7 @@ subroutine acc_kernels !$acc end kernels ! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.kernels num_gangs([[NUMGANGS1]] : i32) { +! CHECK: acc.kernels num_gangs({[[NUMGANGS1]] : i32}) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -103,7 +103,7 @@ subroutine acc_kernels !$acc end kernels ! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.kernels num_gangs([[NUMGANGS2]] : i32) { +! CHECK: acc.kernels num_gangs({[[NUMGANGS2]] : i32}) { ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 index 1856215ce59d1..deee7089033ea 100644 --- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 @@ -64,7 +64,7 @@ subroutine acc_parallel_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncAttr} +! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} !$acc parallel loop async(1) DO i = 1, n @@ -105,7 +105,7 @@ subroutine acc_parallel_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield -! CHECK-NEXT: } attributes {waitAttr} +! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} !$acc parallel loop wait(1) DO i = 1, n @@ -113,7 +113,7 @@ subroutine acc_parallel_loop END DO ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.parallel wait([[WAIT1]] : i32) { +! CHECK: acc.parallel wait({[[WAIT1]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -128,7 +128,7 @@ subroutine acc_parallel_loop ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.parallel wait([[WAIT2]], [[WAIT3]] : i32, i32) { +! CHECK: acc.parallel wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! 
CHECK: acc.yield @@ -143,7 +143,7 @@ subroutine acc_parallel_loop ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.parallel wait([[WAIT4]], [[WAIT5]] : i32, i32) { +! CHECK: acc.parallel wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -157,7 +157,7 @@ subroutine acc_parallel_loop END DO ! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.parallel num_gangs([[NUMGANGS1]] : i32) { +! CHECK: acc.parallel num_gangs({[[NUMGANGS1]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -171,7 +171,7 @@ subroutine acc_parallel_loop END DO ! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.parallel num_gangs([[NUMGANGS2]] : i32) { +! CHECK: acc.parallel num_gangs({[[NUMGANGS2]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield diff --git a/flang/test/Lower/OpenACC/acc-parallel.f90 b/flang/test/Lower/OpenACC/acc-parallel.f90 index bbf51ba36a7de..a369bf01f2599 100644 --- a/flang/test/Lower/OpenACC/acc-parallel.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel.f90 @@ -62,7 +62,7 @@ subroutine acc_parallel ! CHECK: acc.parallel { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncAttr} +! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} !$acc parallel async(1) !$acc end parallel @@ -85,13 +85,13 @@ subroutine acc_parallel ! CHECK: acc.parallel { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {waitAttr} +! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} !$acc parallel wait(1) !$acc end parallel ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.parallel wait([[WAIT1]] : i32) { +! CHECK: acc.parallel wait({[[WAIT1]] : i32}) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -100,7 +100,7 @@ subroutine acc_parallel ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.parallel wait([[WAIT2]], [[WAIT3]] : i32, i32) { +! CHECK: acc.parallel wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -109,7 +109,7 @@ subroutine acc_parallel ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.parallel wait([[WAIT4]], [[WAIT5]] : i32, i32) { +! CHECK: acc.parallel wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -117,7 +117,7 @@ subroutine acc_parallel !$acc end parallel ! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.parallel num_gangs([[NUMGANGS1]] : i32) { +! CHECK: acc.parallel num_gangs({[[NUMGANGS1]] : i32}) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -125,14 +125,14 @@ subroutine acc_parallel !$acc end parallel ! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.parallel num_gangs([[NUMGANGS2]] : i32) { +! CHECK: acc.parallel num_gangs({[[NUMGANGS2]] : i32}) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} !$acc parallel num_gangs(1, 1, 1) !$acc end parallel -! CHECK: acc.parallel num_gangs(%{{.*}}, %{{.*}}, %{{.*}} : i32, i32, i32) { +! CHECK: acc.parallel num_gangs({%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32}) { ! CHECK: acc.yield ! 
CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90 index 4ed7bb8da29a1..712bfc80ce387 100644 --- a/flang/test/Lower/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90 @@ -83,7 +83,7 @@ subroutine acc_serial_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncAttr} +! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} !$acc serial loop async(1) DO i = 1, n @@ -124,7 +124,7 @@ subroutine acc_serial_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield -! CHECK-NEXT: } attributes {waitAttr} +! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} !$acc serial loop wait(1) DO i = 1, n @@ -132,7 +132,7 @@ subroutine acc_serial_loop END DO ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.serial wait([[WAIT1]] : i32) { +! CHECK: acc.serial wait({[[WAIT1]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -147,7 +147,7 @@ subroutine acc_serial_loop ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.serial wait([[WAIT2]], [[WAIT3]] : i32, i32) { +! CHECK: acc.serial wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield @@ -162,7 +162,7 @@ subroutine acc_serial_loop ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.serial wait([[WAIT4]], [[WAIT5]] : i32, i32) { +! CHECK: acc.serial wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { ! CHECK: acc.loop { ! CHECK: fir.do_loop ! CHECK: acc.yield diff --git a/flang/test/Lower/OpenACC/acc-serial.f90 b/flang/test/Lower/OpenACC/acc-serial.f90 index ab3b0ccd54595..d05e51d3d274f 100644 --- a/flang/test/Lower/OpenACC/acc-serial.f90 +++ b/flang/test/Lower/OpenACC/acc-serial.f90 @@ -62,7 +62,7 @@ subroutine acc_serial ! CHECK: acc.serial { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {asyncAttr} +! CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]} !$acc serial async(1) !$acc end serial @@ -85,13 +85,13 @@ subroutine acc_serial ! CHECK: acc.serial { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {waitAttr} +! CHECK-NEXT: } attributes {waitOnly = [#acc.device_type<none>]} !$acc serial wait(1) !$acc end serial ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 -! CHECK: acc.serial wait([[WAIT1]] : i32) { +! CHECK: acc.serial wait({[[WAIT1]] : i32}) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -100,7 +100,7 @@ subroutine acc_serial ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 -! CHECK: acc.serial wait([[WAIT2]], [[WAIT3]] : i32, i32) { +! CHECK: acc.serial wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -109,7 +109,7 @@ subroutine acc_serial ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref<i32> ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref<i32> -! CHECK: acc.serial wait([[WAIT4]], [[WAIT5]] : i32, i32) { +! CHECK: acc.serial wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { ! CHECK: acc.yield ! 
CHECK-NEXT: }{{$}} diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index a78c3e98c9551..234c1076e14e3 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -156,29 +156,46 @@ def DeclareActionAttr : OpenACC_Attr<"DeclareAction", "declare_action"> { } // Device type enumeration. -def OpenACC_DeviceTypeStar : I32EnumAttrCase<"Star", 0, "star">; -def OpenACC_DeviceTypeDefault : I32EnumAttrCase<"Default", 1, "default">; -def OpenACC_DeviceTypeHost : I32EnumAttrCase<"Host", 2, "host">; -def OpenACC_DeviceTypeMulticore : I32EnumAttrCase<"Multicore", 3, "multicore">; -def OpenACC_DeviceTypeNvidia : I32EnumAttrCase<"Nvidia", 4, "nvidia">; -def OpenACC_DeviceTypeRadeon : I32EnumAttrCase<"Radeon", 5, "radeon">; - +def OpenACC_DeviceTypeNone : I32EnumAttrCase<"None", 0, "none">; +def OpenACC_DeviceTypeStar : I32EnumAttrCase<"Star", 1, "star">; +def OpenACC_DeviceTypeDefault : I32EnumAttrCase<"Default", 2, "default">; +def OpenACC_DeviceTypeHost : I32EnumAttrCase<"Host", 3, "host">; +def OpenACC_DeviceTypeMulticore : I32EnumAttrCase<"Multicore", 4, "multicore">; +def OpenACC_DeviceTypeNvidia : I32EnumAttrCase<"Nvidia", 5, "nvidia">; +def OpenACC_DeviceTypeRadeon : I32EnumAttrCase<"Radeon", 6, "radeon">; def OpenACC_DeviceType : I32EnumAttr<"DeviceType", "built-in device type supported by OpenACC", - [OpenACC_DeviceTypeStar, OpenACC_DeviceTypeDefault, + [OpenACC_DeviceTypeNone, OpenACC_DeviceTypeStar, OpenACC_DeviceTypeDefault, OpenACC_DeviceTypeHost, OpenACC_DeviceTypeMulticore, OpenACC_DeviceTypeNvidia, OpenACC_DeviceTypeRadeon ]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::acc"; } + +// Device type attribute is used to associate a value for for clauses that +// appear after a device_type clause. The list of clauses allowed after the +// device_type clause is defined per construct as follows: +// Loop construct: collapse, gang, worker, vector, seq, independent, auto, +// and tile +// Compute construct: async, wait, num_gangs, num_workers, and vector_length +// Data construct: async and wait +// Routine: gang, worker, vector, seq and bind +// +// The `none` means that the value appears before any device_type clause. +// def OpenACC_DeviceTypeAttr : EnumAttr<OpenACC_Dialect, OpenACC_DeviceType, "device_type"> { let assemblyFormat = [{ ```<` $value `>` }]; } +def DeviceTypeArrayAttr : + TypedArrayAttrBase<OpenACC_DeviceTypeAttr, "device type array attribute"> { + let constBuilderCall = ?; +} + // Define a resource for the OpenACC runtime counters. 
def OpenACC_RuntimeCounters : Resource<"::mlir::acc::RuntimeCounters">; @@ -863,24 +880,32 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel", ``` }]; - let arguments = (ins Optional<IntOrIndex>:$async, - UnitAttr:$asyncAttr, - Variadic<IntOrIndex>:$waitOperands, - UnitAttr:$waitAttr, - Variadic<IntOrIndex>:$numGangs, - Optional<IntOrIndex>:$numWorkers, - Optional<IntOrIndex>:$vectorLength, - Optional<I1>:$ifCond, - Optional<I1>:$selfCond, - UnitAttr:$selfAttr, - Variadic<AnyType>:$reductionOperands, - OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes, - Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands, - OptionalAttr<SymbolRefArrayAttr>:$privatizations, - Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands, - OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations, - Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, - OptionalAttr<DefaultValueAttr>:$defaultAttr); + let arguments = (ins + Variadic<IntOrIndex>:$async, + OptionalAttr<DeviceTypeArrayAttr>:$asyncDeviceType, + OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly, + Variadic<IntOrIndex>:$waitOperands, + OptionalAttr<DenseI32ArrayAttr>:$waitOperandsSegments, + OptionalAttr<DeviceTypeArrayAttr>:$waitOperandsDeviceType, + OptionalAttr<DeviceTypeArrayAttr>:$waitOnly, + Variadic<IntOrIndex>:$numGangs, + OptionalAttr<DenseI32ArrayAttr>:$numGangsSegments, + OptionalAttr<DeviceTypeArrayAttr>:$numGangsDeviceType, + Variadic<IntOrIndex>:$numWorkers, + OptionalAttr<DeviceTypeArrayAttr>:$numWorkersDeviceType, + Variadic<IntOrIndex>:$vectorLength, + OptionalAttr<DeviceTypeArrayAttr>:$vectorLengthDeviceType, + Optional<I1>:$ifCond, + Optional<I1>:$selfCond, + UnitAttr:$selfAttr, + Variadic<AnyType>:$reductionOperands, + OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes, + Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands, + OptionalAttr<SymbolRefArrayAttr>:$privatizations, + Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands, + OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations, + Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, + OptionalAttr<DefaultValueAttr>:$defaultAttr); let regions = (region AnyRegion:$region); @@ -890,22 +915,69 @@ def OpenACC_ParallelOp : OpenACC_Op<"parallel", /// The i-th data operand passed. Value getDataOperand(unsigned i); + + /// Return true if the op has the async attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasAsyncOnly(); + /// Return true if the op has the async attribute for the given device_type. + bool hasAsyncOnly(mlir::acc::DeviceType deviceType); + /// Return the value of the async clause if present. + mlir::Value getAsyncValue(); + /// Return the value of the async clause for the given device_type if + /// present. + mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType); + + /// Return the value of the num_workers clause if present. + mlir::Value getNumWorkersValue(); + /// Return the value of the num_workers clause for the given device_type if + /// present. + mlir::Value getNumWorkersValue(mlir::acc::DeviceType deviceType); + + /// Return the value of the vector_length clause if present. + mlir::Value getVectorLengthValue(); + /// Return the value of the vector_length clause for the given device_type + /// if present. + mlir::Value getVectorLengthValue(mlir::acc::DeviceType deviceType); + + /// Return the values of the num_gangs clause if present. 
+ mlir::Operation::operand_range getNumGangsValues(); + /// Return the values of the num_gangs clause for the given device_type if + /// present. + mlir::Operation::operand_range + getNumGangsValues(mlir::acc::DeviceType deviceType); + + /// Return true if the op has the wait attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasWaitOnly(); + /// Return true if the op has the wait attribute for the given device_type. + bool hasWaitOnly(mlir::acc::DeviceType deviceType); + /// Return the values of the wait clause if present. + mlir::Operation::operand_range getWaitValues(); + /// Return the values of the wait clause for the given device_type if + /// present. + mlir::Operation::operand_range + getWaitValues(mlir::acc::DeviceType deviceType); }]; let assemblyFormat = [{ oilist( `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` - | `async` `(` $async `:` type($async) `)` + | `async` `(` custom<DeviceTypeOperands>($async, + type($async), $asyncDeviceType) `)` | `firstprivate` `(` custom<SymOperandList>($gangFirstPrivateOperands, type($gangFirstPrivateOperands), $firstprivatizations) `)` - | `num_gangs` `(` $numGangs `:` type($numGangs) `)` - | `num_workers` `(` $numWorkers `:` type($numWorkers) `)` + | `num_gangs` `(` custom<NumGangs>($numGangs, + type($numGangs), $numGangsDeviceType, $numGangsSegments) `)` + | `num_workers` `(` custom<DeviceTypeOperands>($numWorkers, + type($numWorkers), $numWorkersDeviceType) `)` | `private` `(` custom<SymOperandList>( $gangPrivateOperands, type($gangPrivateOperands), $privatizations) `)` - | `vector_length` `(` $vectorLength `:` type($vectorLength) `)` - | `wait` `(` $waitOperands `:` type($waitOperands) `)` + | `vector_length` `(` custom<DeviceTypeOperands>($vectorLength, + type($vectorLength), $vectorLengthDeviceType) `)` + | `wait` `(` custom<WaitOperands>($waitOperands, + type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)` | `self` `(` $selfCond `)` | `if` `(` $ifCond `)` | `reduction` `(` custom<SymOperandList>( @@ -939,21 +1011,25 @@ def OpenACC_SerialOp : OpenACC_Op<"serial", ``` }]; - let arguments = (ins Optional<IntOrIndex>:$async, - UnitAttr:$asyncAttr, - Variadic<IntOrIndex>:$waitOperands, - UnitAttr:$waitAttr, - Optional<I1>:$ifCond, - Optional<I1>:$selfCond, - UnitAttr:$selfAttr, - Variadic<AnyType>:$reductionOperands, - OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes, - Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands, - OptionalAttr<SymbolRefArrayAttr>:$privatizations, - Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands, - OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations, - Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, - OptionalAttr<DefaultValueAttr>:$defaultAttr); + let arguments = (ins + Variadic<IntOrIndex>:$async, + OptionalAttr<DeviceTypeArrayAttr>:$asyncDeviceType, + OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly, + Variadic<IntOrIndex>:$waitOperands, + OptionalAttr<DenseI32ArrayAttr>:$waitOperandsSegments, + OptionalAttr<DeviceTypeArrayAttr>:$waitOperandsDeviceType, + OptionalAttr<DeviceTypeArrayAttr>:$waitOnly, + Optional<I1>:$ifCond, + Optional<I1>:$selfCond, + UnitAttr:$selfAttr, + Variadic<AnyType>:$reductionOperands, + OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes, + Variadic<OpenACC_PointerLikeTypeInterface>:$gangPrivateOperands, + OptionalAttr<SymbolRefArrayAttr>:$privatizations, + Variadic<OpenACC_PointerLikeTypeInterface>:$gangFirstPrivateOperands, + 
OptionalAttr<SymbolRefArrayAttr>:$firstprivatizations, + Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, + OptionalAttr<DefaultValueAttr>:$defaultAttr); let regions = (region AnyRegion:$region); @@ -963,19 +1039,44 @@ def OpenACC_SerialOp : OpenACC_Op<"serial", /// The i-th data operand passed. Value getDataOperand(unsigned i); + + /// Return true if the op has the async attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasAsyncOnly(); + /// Return true if the op has the async attribute for the given device_type. + bool hasAsyncOnly(mlir::acc::DeviceType deviceType); + /// Return the value of the async clause if present. + mlir::Value getAsyncValue(); + /// Return the value of the async clause for the given device_type if + /// present. + mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType); + + /// Return true if the op has the wait attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasWaitOnly(); + /// Return true if the op has the wait attribute for the given device_type. + bool hasWaitOnly(mlir::acc::DeviceType deviceType); + /// Return the values of the wait clause if present. + mlir::Operation::operand_range getWaitValues(); + /// Return the values of the wait clause for the given device_type if + /// present. + mlir::Operation::operand_range + getWaitValues(mlir::acc::DeviceType deviceType); }]; let assemblyFormat = [{ oilist( `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` - | `async` `(` $async `:` type($async) `)` + | `async` `(` custom<DeviceTypeOperands>($async, + type($async), $asyncDeviceType) `)` | `firstprivate` `(` custom<SymOperandList>($gangFirstPrivateOperands, type($gangFirstPrivateOperands), $firstprivatizations) `)` | `private` `(` custom<SymOperandList>( $gangPrivateOperands, type($gangPrivateOperands), $privatizations) `)` - | `wait` `(` $waitOperands `:` type($waitOperands) `)` + | `wait` `(` custom<WaitOperands>($waitOperands, + type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)` | `self` `(` $selfCond `)` | `if` `(` $ifCond `)` | `reduction` `(` custom<SymOperandList>( @@ -1011,18 +1112,26 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels", ``` }]; - let arguments = (ins Optional<IntOrIndex>:$async, - UnitAttr:$asyncAttr, - Variadic<IntOrIndex>:$waitOperands, - UnitAttr:$waitAttr, - Variadic<IntOrIndex>:$numGangs, - Optional<IntOrIndex>:$numWorkers, - Optional<IntOrIndex>:$vectorLength, - Optional<I1>:$ifCond, - Optional<I1>:$selfCond, - UnitAttr:$selfAttr, - Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, - OptionalAttr<DefaultValueAttr>:$defaultAttr); + let arguments = (ins + Variadic<IntOrIndex>:$async, + OptionalAttr<DeviceTypeArrayAttr>:$asyncDeviceType, + OptionalAttr<DeviceTypeArrayAttr>:$asyncOnly, + Variadic<IntOrIndex>:$waitOperands, + OptionalAttr<DenseI32ArrayAttr>:$waitOperandsSegments, + OptionalAttr<DeviceTypeArrayAttr>:$waitOperandsDeviceType, + OptionalAttr<DeviceTypeArrayAttr>:$waitOnly, + Variadic<IntOrIndex>:$numGangs, + OptionalAttr<DenseI32ArrayAttr>:$numGangsSegments, + OptionalAttr<DeviceTypeArrayAttr>:$numGangsDeviceType, + Variadic<IntOrIndex>:$numWorkers, + OptionalAttr<DeviceTypeArrayAttr>:$numWorkersDeviceType, + Variadic<IntOrIndex>:$vectorLength, + OptionalAttr<DeviceTypeArrayAttr>:$vectorLengthDeviceType, + Optional<I1>:$ifCond, + Optional<I1>:$selfCond, + UnitAttr:$selfAttr, + Variadic<OpenACC_PointerLikeTypeInterface>:$dataClauseOperands, + OptionalAttr<DefaultValueAttr>:$defaultAttr); let regions = 
(region AnyRegion:$region); @@ -1032,16 +1141,63 @@ def OpenACC_KernelsOp : OpenACC_Op<"kernels", /// The i-th data operand passed. Value getDataOperand(unsigned i); + + /// Return true if the op has the async attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasAsyncOnly(); + /// Return true if the op has the async attribute for the given device_type. + bool hasAsyncOnly(mlir::acc::DeviceType deviceType); + /// Return the value of the async clause if present. + mlir::Value getAsyncValue(); + /// Return the value of the async clause for the given device_type if + /// present. + mlir::Value getAsyncValue(mlir::acc::DeviceType deviceType); + + /// Return the value of the num_workers clause if present. + mlir::Value getNumWorkersValue(); + /// Return the value of the num_workers clause for the given device_type if + /// present. + mlir::Value getNumWorkersValue(mlir::acc::DeviceType deviceType); + + /// Return the value of the vector_length clause if present. + mlir::Value getVectorLengthValue(); + /// Return the value of the vector_length clause for the given device_type + /// if present. + mlir::Value getVectorLengthValue(mlir::acc::DeviceType deviceType); + + /// Return the values of the num_gangs clause if present. + mlir::Operation::operand_range getNumGangsValues(); + /// Return the values of the num_gangs clause for the given device_type if + /// present. + mlir::Operation::operand_range + getNumGangsValues(mlir::acc::DeviceType deviceType); + + /// Return true if the op has the wait attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasWaitOnly(); + /// Return true if the op has the wait attribute for the given device_type. + bool hasWaitOnly(mlir::acc::DeviceType deviceType); + /// Return the values of the wait clause if present. + mlir::Operation::operand_range getWaitValues(); + /// Return the values of the wait clause for the given device_type if + /// present. + mlir::Operation::operand_range + getWaitValues(mlir::acc::DeviceType deviceType); }]; let assemblyFormat = [{ oilist( `dataOperands` `(` $dataClauseOperands `:` type($dataClauseOperands) `)` - | `async` `(` $async `:` type($async) `)` - | `num_gangs` `(` $numGangs `:` type($numGangs) `)` - | `num_workers` `(` $numWorkers `:` type($numWorkers) `)` - | `vector_length` `(` $vectorLength `:` type($vectorLength) `)` - | `wait` `(` $waitOperands `:` type($waitOperands) `)` + | `async` `(` custom<DeviceTypeOperands>($async, + type($async), $asyncDeviceType) `)` + | `num_gangs` `(` custom<NumGangs>($numGangs, + type($numGangs), $numGangsDeviceType, $numGangsSegments) `)` + | `num_workers` `(` custom<DeviceTypeOperands>($numWorkers, + type($numWorkers), $numWorkersDeviceType) `)` + | `vector_length` `(` custom<DeviceTypeOperands>($vectorLength, + type($vectorLength), $vectorLengthDeviceType) `)` + | `wait` `(` custom<WaitOperands>($waitOperands, + type($waitOperands), $waitOperandsDeviceType, $waitOperandsSegments) `)` | `self` `(` $selfCond `)` | `if` `(` $ifCond `)` ) diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index df4f7825545c2..45e0632db5ef2 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -615,15 +615,49 @@ unsigned ParallelOp::getNumDataOperands() { } Value ParallelOp::getDataOperand(unsigned i) { - unsigned numOptional = getAsync() ? 1 : 0; + unsigned numOptional = getAsync().size(); numOptional += getNumGangs().size(); - numOptional += getNumWorkers() ? 
1 : 0; - numOptional += getVectorLength() ? 1 : 0; + numOptional += getNumWorkers().size(); + numOptional += getVectorLength().size(); numOptional += getIfCond() ? 1 : 0; numOptional += getSelfCond() ? 1 : 0; return getOperand(getWaitOperands().size() + numOptional + i); } +template <typename Op> +static LogicalResult verifyDeviceTypeCountMatch(Op op, OperandRange operands, + ArrayAttr deviceTypes, + llvm::StringRef keyword) { + if (operands.size() > 0 && deviceTypes.getValue().size() != operands.size()) + return op.emitOpError() << keyword << " operands count must match " + << keyword << " device_type count"; + return success(); +} + +template <typename Op> +static LogicalResult verifyDeviceTypeAndSegmentCountMatch( + Op op, OperandRange operands, DenseI32ArrayAttr segments, + ArrayAttr deviceTypes, llvm::StringRef keyword, int32_t maxInSegment = 0) { + std::size_t numOperandsInSegments = 0; + + if (!segments) + return success(); + + for (auto segCount : segments.asArrayRef()) { + if (maxInSegment != 0 && segCount > maxInSegment) + return op.emitOpError() << keyword << " expects a maximum of " + << maxInSegment << " values per segment"; + numOperandsInSegments += segCount; + } + if (numOperandsInSegments != operands.size()) + return op.emitOpError() + << keyword << " operand count does not match count in segments"; + if (deviceTypes.getValue().size() != (size_t)segments.size()) + return op.emitOpError() + << keyword << " segment count does not match device_type count"; + return success(); +} + LogicalResult acc::ParallelOp::verify() { if (failed(checkSymOperandList<mlir::acc::PrivateRecipeOp>( *this, getPrivatizations(), getGangPrivateOperands(), "private", @@ -633,11 +667,322 @@ LogicalResult acc::ParallelOp::verify() { *this, getReductionRecipes(), getReductionOperands(), "reduction", "reductions", false))) return failure(); - if (getNumGangs().size() > 3) - return emitOpError() << "num_gangs expects a maximum of 3 values"; + + if (failed(verifyDeviceTypeAndSegmentCountMatch( + *this, getNumGangs(), getNumGangsSegmentsAttr(), + getNumGangsDeviceTypeAttr(), "num_gangs", 3))) + return failure(); + + if (failed(verifyDeviceTypeAndSegmentCountMatch( + *this, getWaitOperands(), getWaitOperandsSegmentsAttr(), + getWaitOperandsDeviceTypeAttr(), "wait"))) + return failure(); + + if (failed(verifyDeviceTypeCountMatch(*this, getNumWorkers(), + getNumWorkersDeviceTypeAttr(), + "num_workers"))) + return failure(); + + if (failed(verifyDeviceTypeCountMatch(*this, getVectorLength(), + getVectorLengthDeviceTypeAttr(), + "vector_length"))) + return failure(); + + if (failed(verifyDeviceTypeCountMatch(*this, getAsync(), + getAsyncDeviceTypeAttr(), "async"))) + return failure(); + return checkDataOperands<acc::ParallelOp>(*this, getDataClauseOperands()); } +static std::optional<unsigned> findSegment(ArrayAttr segments, + mlir::acc::DeviceType deviceType) { + unsigned segmentIdx = 0; + for (auto attr : segments) { + auto deviceTypeAttr = mlir::dyn_cast<mlir::acc::DeviceTypeAttr>(attr); + if (deviceTypeAttr.getValue() == deviceType) + return std::make_optional(segmentIdx); + ++segmentIdx; + } + return std::nullopt; +} + +static mlir::Value +getValueInDeviceTypeSegment(std::optional<mlir::ArrayAttr> arrayAttr, + mlir::Operation::operand_range range, + mlir::acc::DeviceType deviceType) { + if (!arrayAttr) + return {}; + if (auto pos = findSegment(*arrayAttr, deviceType)) + return range[*pos]; + return {}; +} + +bool acc::ParallelOp::hasAsyncOnly() { + return hasAsyncOnly(mlir::acc::DeviceType::None); +} + 
+bool acc::ParallelOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getAsyncOnly()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +mlir::Value acc::ParallelOp::getAsyncValue() { + return getAsyncValue(mlir::acc::DeviceType::None); +} + +mlir::Value acc::ParallelOp::getAsyncValue(mlir::acc::DeviceType deviceType) { + return getValueInDeviceTypeSegment(getAsyncDeviceType(), getAsync(), + deviceType); +} + +mlir::Value acc::ParallelOp::getNumWorkersValue() { + return getNumWorkersValue(mlir::acc::DeviceType::None); +} + +mlir::Value +acc::ParallelOp::getNumWorkersValue(mlir::acc::DeviceType deviceType) { + return getValueInDeviceTypeSegment(getNumWorkersDeviceType(), getNumWorkers(), + deviceType); +} + +mlir::Value acc::ParallelOp::getVectorLengthValue() { + return getVectorLengthValue(mlir::acc::DeviceType::None); +} + +mlir::Value +acc::ParallelOp::getVectorLengthValue(mlir::acc::DeviceType deviceType) { + return getValueInDeviceTypeSegment(getVectorLengthDeviceType(), + getVectorLength(), deviceType); +} + +mlir::Operation::operand_range ParallelOp::getNumGangsValues() { + return getNumGangsValues(mlir::acc::DeviceType::None); +} + +static mlir::Operation::operand_range +getValuesFromSegments(std::optional<mlir::ArrayAttr> arrayAttr, + mlir::Operation::operand_range range, + std::optional<llvm::ArrayRef<int32_t>> segments, + mlir::acc::DeviceType deviceType) { + if (!arrayAttr) + return range.take_front(0); + if (auto pos = findSegment(*arrayAttr, deviceType)) { + int32_t nbOperandsBefore = 0; + for (unsigned i = 0; i < *pos; ++i) + nbOperandsBefore += (*segments)[i]; + return range.drop_front(nbOperandsBefore).take_front((*segments)[*pos]); + } + return range.take_front(0); +} + +mlir::Operation::operand_range +ParallelOp::getNumGangsValues(mlir::acc::DeviceType deviceType) { + return getValuesFromSegments(getNumGangsDeviceType(), getNumGangs(), + getNumGangsSegments(), deviceType); +} + +bool acc::ParallelOp::hasWaitOnly() { + return hasWaitOnly(mlir::acc::DeviceType::None); +} + +bool acc::ParallelOp::hasWaitOnly(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getWaitOnly()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +mlir::Operation::operand_range ParallelOp::getWaitValues() { + return getWaitValues(mlir::acc::DeviceType::None); +} + +mlir::Operation::operand_range +ParallelOp::getWaitValues(mlir::acc::DeviceType deviceType) { + return getValuesFromSegments(getWaitOperandsDeviceType(), getWaitOperands(), + getWaitOperandsSegments(), deviceType); +} + +static ParseResult parseNumGangs( + mlir::OpAsmParser &parser, + llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &operands, + llvm::SmallVectorImpl<Type> &types, mlir::ArrayAttr &deviceTypes, + mlir::DenseI32ArrayAttr &segments) { + llvm::SmallVector<DeviceTypeAttr> attributes; + llvm::SmallVector<int32_t> seg; + + do { + if (failed(parser.parseLBrace())) + return failure(); + + if (failed(parser.parseCommaSeparatedList( + mlir::AsmParser::Delimiter::None, [&]() { + if (parser.parseOperand(operands.emplace_back()) || + parser.parseColonType(types.emplace_back())) + return failure(); + return success(); + }))) + return failure(); + + seg.push_back(operands.size()); + + if (failed(parser.parseRBrace())) + return failure(); + + if (succeeded(parser.parseOptionalLSquare())) { + if (parser.parseAttribute(attributes.emplace_back()) || + parser.parseRSquare()) + return failure(); + } else { + 
attributes.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + } + } while (succeeded(parser.parseOptionalComma())); + + llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(), + attributes.end()); + deviceTypes = ArrayAttr::get(parser.getContext(), arrayAttr); + segments = DenseI32ArrayAttr::get(parser.getContext(), seg); + + return success(); +} + +static void printNumGangs(mlir::OpAsmPrinter &p, mlir::Operation *op, + mlir::OperandRange operands, mlir::TypeRange types, + std::optional<mlir::ArrayAttr> deviceTypes, + std::optional<mlir::DenseI32ArrayAttr> segments) { + unsigned opIdx = 0; + for (unsigned i = 0; i < deviceTypes->size(); ++i) { + if (i != 0) + p << ", "; + p << "{"; + for (int32_t j = 0; j < (*segments)[i]; ++j) { + if (j != 0) + p << ", "; + p << operands[opIdx] << " : " << operands[opIdx].getType(); + ++opIdx; + } + p << "}"; + auto deviceTypeAttr = + mlir::dyn_cast<mlir::acc::DeviceTypeAttr>((*deviceTypes)[i]); + if (deviceTypeAttr.getValue() != mlir::acc::DeviceType::None) + p << " [" << (*deviceTypes)[i] << "]"; + } +} + +static ParseResult parseWaitOperands( + mlir::OpAsmParser &parser, + llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &operands, + llvm::SmallVectorImpl<Type> &types, mlir::ArrayAttr &deviceTypes, + mlir::DenseI32ArrayAttr &segments) { + llvm::SmallVector<DeviceTypeAttr> attributes; + llvm::SmallVector<int32_t> seg; + + do { + if (failed(parser.parseLBrace())) + return failure(); + + if (failed(parser.parseCommaSeparatedList( + mlir::AsmParser::Delimiter::None, [&]() { + if (parser.parseOperand(operands.emplace_back()) || + parser.parseColonType(types.emplace_back())) + return failure(); + return success(); + }))) + return failure(); + + seg.push_back(operands.size()); + + if (failed(parser.parseRBrace())) + return failure(); + + if (succeeded(parser.parseOptionalLSquare())) { + if (parser.parseAttribute(attributes.emplace_back()) || + parser.parseRSquare()) + return failure(); + } else { + attributes.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + } + } while (succeeded(parser.parseOptionalComma())); + + llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(), + attributes.end()); + deviceTypes = ArrayAttr::get(parser.getContext(), arrayAttr); + segments = DenseI32ArrayAttr::get(parser.getContext(), seg); + + return success(); +} + +static void printWaitOperands(mlir::OpAsmPrinter &p, mlir::Operation *op, + mlir::OperandRange operands, + mlir::TypeRange types, + std::optional<mlir::ArrayAttr> deviceTypes, + std::optional<mlir::DenseI32ArrayAttr> segments) { + unsigned opIdx = 0; + for (unsigned i = 0; i < deviceTypes->size(); ++i) { + if (i != 0) + p << ", "; + p << "{"; + for (int32_t j = 0; j < (*segments)[i]; ++j) { + if (j != 0) + p << ", "; + p << operands[opIdx] << " : " << operands[opIdx].getType(); + ++opIdx; + } + p << "}"; + auto deviceTypeAttr = + mlir::dyn_cast<mlir::acc::DeviceTypeAttr>((*deviceTypes)[i]); + if (deviceTypeAttr.getValue() != mlir::acc::DeviceType::None) + p << " [" << (*deviceTypes)[i] << "]"; + } +} + +static ParseResult parseDeviceTypeOperands( + mlir::OpAsmParser &parser, + llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &operands, + llvm::SmallVectorImpl<Type> &types, mlir::ArrayAttr &deviceTypes) { + llvm::SmallVector<DeviceTypeAttr> attributes; + if (failed(parser.parseCommaSeparatedList([&]() { + if (parser.parseOperand(operands.emplace_back()) || + 
parser.parseColonType(types.emplace_back())) + return failure(); + if (succeeded(parser.parseOptionalLSquare())) { + if (parser.parseAttribute(attributes.emplace_back()) || + parser.parseRSquare()) + return failure(); + } else { + attributes.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + } + return success(); + }))) + return failure(); + llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(), + attributes.end()); + deviceTypes = ArrayAttr::get(parser.getContext(), arrayAttr); + return success(); +} + +static void +printDeviceTypeOperands(mlir::OpAsmPrinter &p, mlir::Operation *op, + mlir::OperandRange operands, mlir::TypeRange types, + std::optional<mlir::ArrayAttr> deviceTypes) { + for (unsigned i = 0, e = deviceTypes->size(); i < e; ++i) { + if (i != 0) + p << ", "; + p << operands[i] << " : " << operands[i].getType(); + auto deviceTypeAttr = + mlir::dyn_cast<mlir::acc::DeviceTypeAttr>((*deviceTypes)[i]); + if (deviceTypeAttr.getValue() != mlir::acc::DeviceType::None) + p << " [" << (*deviceTypes)[i] << "]"; + } +} + //===----------------------------------------------------------------------===// // SerialOp //===----------------------------------------------------------------------===// @@ -648,12 +993,55 @@ unsigned SerialOp::getNumDataOperands() { } Value SerialOp::getDataOperand(unsigned i) { - unsigned numOptional = getAsync() ? 1 : 0; + unsigned numOptional = getAsync().size(); numOptional += getIfCond() ? 1 : 0; numOptional += getSelfCond() ? 1 : 0; return getOperand(getWaitOperands().size() + numOptional + i); } +bool acc::SerialOp::hasAsyncOnly() { + return hasAsyncOnly(mlir::acc::DeviceType::None); +} + +bool acc::SerialOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getAsyncOnly()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +mlir::Value acc::SerialOp::getAsyncValue() { + return getAsyncValue(mlir::acc::DeviceType::None); +} + +mlir::Value acc::SerialOp::getAsyncValue(mlir::acc::DeviceType deviceType) { + return getValueInDeviceTypeSegment(getAsyncDeviceType(), getAsync(), + deviceType); +} + +bool acc::SerialOp::hasWaitOnly() { + return hasWaitOnly(mlir::acc::DeviceType::None); +} + +bool acc::SerialOp::hasWaitOnly(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getWaitOnly()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +mlir::Operation::operand_range SerialOp::getWaitValues() { + return getWaitValues(mlir::acc::DeviceType::None); +} + +mlir::Operation::operand_range +SerialOp::getWaitValues(mlir::acc::DeviceType deviceType) { + return getValuesFromSegments(getWaitOperandsDeviceType(), getWaitOperands(), + getWaitOperandsSegments(), deviceType); +} + LogicalResult acc::SerialOp::verify() { if (failed(checkSymOperandList<mlir::acc::PrivateRecipeOp>( *this, getPrivatizations(), getGangPrivateOperands(), "private", @@ -663,6 +1051,16 @@ LogicalResult acc::SerialOp::verify() { *this, getReductionRecipes(), getReductionOperands(), "reduction", "reductions", false))) return failure(); + + if (failed(verifyDeviceTypeAndSegmentCountMatch( + *this, getWaitOperands(), getWaitOperandsSegmentsAttr(), + getWaitOperandsDeviceTypeAttr(), "wait"))) + return failure(); + + if (failed(verifyDeviceTypeCountMatch(*this, getAsync(), + getAsyncDeviceTypeAttr(), "async"))) + return failure(); + return checkDataOperands<acc::SerialOp>(*this, getDataClauseOperands()); } @@ -675,19 +1073,114 @@ unsigned 
KernelsOp::getNumDataOperands() { } Value KernelsOp::getDataOperand(unsigned i) { - unsigned numOptional = getAsync() ? 1 : 0; + unsigned numOptional = getAsync().size(); numOptional += getWaitOperands().size(); numOptional += getNumGangs().size(); - numOptional += getNumWorkers() ? 1 : 0; - numOptional += getVectorLength() ? 1 : 0; + numOptional += getNumWorkers().size(); + numOptional += getVectorLength().size(); numOptional += getIfCond() ? 1 : 0; numOptional += getSelfCond() ? 1 : 0; return getOperand(numOptional + i); } +bool acc::KernelsOp::hasAsyncOnly() { + return hasAsyncOnly(mlir::acc::DeviceType::None); +} + +bool acc::KernelsOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getAsyncOnly()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +mlir::Value acc::KernelsOp::getAsyncValue() { + return getAsyncValue(mlir::acc::DeviceType::None); +} + +mlir::Value acc::KernelsOp::getAsyncValue(mlir::acc::DeviceType deviceType) { + return getValueInDeviceTypeSegment(getAsyncDeviceType(), getAsync(), + deviceType); +} + +mlir::Value acc::KernelsOp::getNumWorkersValue() { + return getNumWorkersValue(mlir::acc::DeviceType::None); +} + +mlir::Value +acc::KernelsOp::getNumWorkersValue(mlir::acc::DeviceType deviceType) { + return getValueInDeviceTypeSegment(getNumWorkersDeviceType(), getNumWorkers(), + deviceType); +} + +mlir::Value acc::KernelsOp::getVectorLengthValue() { + return getVectorLengthValue(mlir::acc::DeviceType::None); +} + +mlir::Value +acc::KernelsOp::getVectorLengthValue(mlir::acc::DeviceType deviceType) { + return getValueInDeviceTypeSegment(getVectorLengthDeviceType(), + getVectorLength(), deviceType); +} + +mlir::Operation::operand_range KernelsOp::getNumGangsValues() { + return getNumGangsValues(mlir::acc::DeviceType::None); +} + +mlir::Operation::operand_range +KernelsOp::getNumGangsValues(mlir::acc::DeviceType deviceType) { + return getValuesFromSegments(getNumGangsDeviceType(), getNumGangs(), + getNumGangsSegments(), deviceType); +} + +bool acc::KernelsOp::hasWaitOnly() { + return hasWaitOnly(mlir::acc::DeviceType::None); +} + +bool acc::KernelsOp::hasWaitOnly(mlir::acc::DeviceType deviceType) { + if (auto arrayAttr = getWaitOnly()) { + if (findSegment(*arrayAttr, deviceType)) + return true; + } + return false; +} + +mlir::Operation::operand_range KernelsOp::getWaitValues() { + return getWaitValues(mlir::acc::DeviceType::None); +} + +mlir::Operation::operand_range +KernelsOp::getWaitValues(mlir::acc::DeviceType deviceType) { + return getValuesFromSegments(getWaitOperandsDeviceType(), getWaitOperands(), + getWaitOperandsSegments(), deviceType); +} + LogicalResult acc::KernelsOp::verify() { - if (getNumGangs().size() > 3) - return emitOpError() << "num_gangs expects a maximum of 3 values"; + if (failed(verifyDeviceTypeAndSegmentCountMatch( + *this, getNumGangs(), getNumGangsSegmentsAttr(), + getNumGangsDeviceTypeAttr(), "num_gangs", 3))) + return failure(); + + if (failed(verifyDeviceTypeAndSegmentCountMatch( + *this, getWaitOperands(), getWaitOperandsSegmentsAttr(), + getWaitOperandsDeviceTypeAttr(), "wait"))) + return failure(); + + if (failed(verifyDeviceTypeCountMatch(*this, getNumWorkers(), + getNumWorkersDeviceTypeAttr(), + "num_workers"))) + return failure(); + + if (failed(verifyDeviceTypeCountMatch(*this, getVectorLength(), + getVectorLengthDeviceTypeAttr(), + "vector_length"))) + return failure(); + + if (failed(verifyDeviceTypeCountMatch(*this, getAsync(), + getAsyncDeviceTypeAttr(), "async"))) + return 
failure(); + return checkDataOperands<acc::KernelsOp>(*this, getDataClauseOperands()); } diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir index b9ac68d0592c8..c18d964b370f2 100644 --- a/mlir/test/Dialect/OpenACC/invalid.mlir +++ b/mlir/test/Dialect/OpenACC/invalid.mlir @@ -462,8 +462,8 @@ acc.loop gang() { // ----- %i64value = arith.constant 1 : i64 -// expected-error@+1 {{num_gangs expects a maximum of 3 values}} -acc.parallel num_gangs(%i64value, %i64value, %i64value, %i64value : i64, i64, i64, i64) { +// expected-error@+1 {{num_gangs expects a maximum of 3 values per segment}} +acc.parallel num_gangs({%i64value: i64, %i64value : i64, %i64value : i64, %i64value : i64}) { } // ----- diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 05b0450c7fb91..5a95811685f84 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -137,7 +137,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x %pd = acc.present varPtr(%d : memref<10xf32>) -> memref<10xf32> acc.data dataOperands(%pa, %pb, %pc, %pd: memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) { %private = acc.private varPtr(%c : memref<10xf32>) -> memref<10xf32> - acc.parallel num_gangs(%numGangs: i64) num_workers(%numWorkers: i64) private(@privatization_memref_10_f32 -> %private : memref<10xf32>) { + acc.parallel num_gangs({%numGangs: i64}) num_workers(%numWorkers: i64 [#acc.device_type<nvidia>]) private(@privatization_memref_10_f32 -> %private : memref<10xf32>) { acc.loop gang { scf.for %x = %lb to %c10 step %st { acc.loop worker { @@ -180,7 +180,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x // CHECK-NEXT: [[NUMWORKERS:%.*]] = arith.constant 10 : i64 // CHECK: acc.data dataOperands(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) { // CHECK-NEXT: %[[P_ARG2:.*]] = acc.private varPtr([[ARG2]] : memref<10xf32>) -> memref<10xf32> -// CHECK-NEXT: acc.parallel num_gangs([[NUMGANG]] : i64) num_workers([[NUMWORKERS]] : i64) private(@privatization_memref_10_f32 -> %[[P_ARG2]] : memref<10xf32>) { +// CHECK-NEXT: acc.parallel num_gangs({[[NUMGANG]] : i64}) num_workers([[NUMWORKERS]] : i64 [#acc.device_type<nvidia>]) private(@privatization_memref_10_f32 -> %[[P_ARG2]] : memref<10xf32>) { // CHECK-NEXT: acc.loop gang { // CHECK-NEXT: scf.for %{{.*}} = [[C0]] to [[C10]] step [[C1]] { // CHECK-NEXT: acc.loop worker { @@ -439,25 +439,25 @@ func.func @testparallelop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x } acc.parallel async(%idxValue: index) { } - acc.parallel wait(%i64value: i64) { + acc.parallel wait({%i64value: i64}) { } - acc.parallel wait(%i32value: i32) { + acc.parallel wait({%i32value: i32}) { } - acc.parallel wait(%idxValue: index) { + acc.parallel wait({%idxValue: index}) { } - acc.parallel wait(%i64value, %i32value, %idxValue : i64, i32, index) { + acc.parallel wait({%i64value : i64, %i32value : i32, %idxValue : index}) { } - acc.parallel num_gangs(%i64value: i64) { + acc.parallel num_gangs({%i64value: i64}) { } - acc.parallel num_gangs(%i32value: i32) { + acc.parallel num_gangs({%i32value: i32}) { } - acc.parallel num_gangs(%idxValue: index) { + acc.parallel num_gangs({%idxValue: index}) { } - acc.parallel num_gangs(%i64value, %i64value, %idxValue : i64, i64, index) { + acc.parallel num_gangs({%i64value: i64, %i64value: i64, %idxValue: index}) { } - acc.parallel 
num_workers(%i64value: i64) { + acc.parallel num_workers(%i64value: i64 [#acc.device_type<nvidia>]) { } - acc.parallel num_workers(%i32value: i32) { + acc.parallel num_workers(%i32value: i32 [#acc.device_type<default>]) { } acc.parallel num_workers(%idxValue: index) { } @@ -492,25 +492,25 @@ func.func @testparallelop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x // CHECK-NEXT: } // CHECK: acc.parallel async([[IDXVALUE]] : index) { // CHECK-NEXT: } -// CHECK: acc.parallel wait([[I64VALUE]] : i64) { +// CHECK: acc.parallel wait({[[I64VALUE]] : i64}) { // CHECK-NEXT: } -// CHECK: acc.parallel wait([[I32VALUE]] : i32) { +// CHECK: acc.parallel wait({[[I32VALUE]] : i32}) { // CHECK-NEXT: } -// CHECK: acc.parallel wait([[IDXVALUE]] : index) { +// CHECK: acc.parallel wait({[[IDXVALUE]] : index}) { // CHECK-NEXT: } -// CHECK: acc.parallel wait([[I64VALUE]], [[I32VALUE]], [[IDXVALUE]] : i64, i32, index) { +// CHECK: acc.parallel wait({[[I64VALUE]] : i64, [[I32VALUE]] : i32, [[IDXVALUE]] : index}) { // CHECK-NEXT: } -// CHECK: acc.parallel num_gangs([[I64VALUE]] : i64) { +// CHECK: acc.parallel num_gangs({[[I64VALUE]] : i64}) { // CHECK-NEXT: } -// CHECK: acc.parallel num_gangs([[I32VALUE]] : i32) { +// CHECK: acc.parallel num_gangs({[[I32VALUE]] : i32}) { // CHECK-NEXT: } -// CHECK: acc.parallel num_gangs([[IDXVALUE]] : index) { +// CHECK: acc.parallel num_gangs({[[IDXVALUE]] : index}) { // CHECK-NEXT: } -// CHECK: acc.parallel num_gangs([[I64VALUE]], [[I64VALUE]], [[IDXVALUE]] : i64, i64, index) { +// CHECK: acc.parallel num_gangs({[[I64VALUE]] : i64, [[I64VALUE]] : i64, [[IDXVALUE]] : index}) { // CHECK-NEXT: } -// CHECK: acc.parallel num_workers([[I64VALUE]] : i64) { +// CHECK: acc.parallel num_workers([[I64VALUE]] : i64 [#acc.device_type<nvidia>]) { // CHECK-NEXT: } -// CHECK: acc.parallel num_workers([[I32VALUE]] : i32) { +// CHECK: acc.parallel num_workers([[I32VALUE]] : i32 [#acc.device_type<default>]) { // CHECK-NEXT: } // CHECK: acc.parallel num_workers([[IDXVALUE]] : index) { // CHECK-NEXT: } @@ -590,13 +590,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 } acc.serial async(%idxValue: index) { } - acc.serial wait(%i64value: i64) { + acc.serial wait({%i64value: i64}) { } - acc.serial wait(%i32value: i32) { + acc.serial wait({%i32value: i32}) { } - acc.serial wait(%idxValue: index) { + acc.serial wait({%idxValue: index}) { } - acc.serial wait(%i64value, %i32value, %idxValue : i64, i32, index) { + acc.serial wait({%i64value : i64, %i32value : i32, %idxValue : index}) { } %firstprivate = acc.firstprivate varPtr(%b : memref<10xf32>) -> memref<10xf32> acc.serial private(@privatization_memref_10_f32 -> %a : memref<10xf32>, @privatization_memref_10_10_f32 -> %c : memref<10x10xf32>) firstprivate(@firstprivatization_memref_10xf32 -> %firstprivate : memref<10xf32>) { @@ -627,13 +627,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 // CHECK-NEXT: } // CHECK: acc.serial async([[IDXVALUE]] : index) { // CHECK-NEXT: } -// CHECK: acc.serial wait([[I64VALUE]] : i64) { +// CHECK: acc.serial wait({[[I64VALUE]] : i64}) { // CHECK-NEXT: } -// CHECK: acc.serial wait([[I32VALUE]] : i32) { +// CHECK: acc.serial wait({[[I32VALUE]] : i32}) { // CHECK-NEXT: } -// CHECK: acc.serial wait([[IDXVALUE]] : index) { +// CHECK: acc.serial wait({[[IDXVALUE]] : index}) { // CHECK-NEXT: } -// CHECK: acc.serial wait([[I64VALUE]], [[I32VALUE]], [[IDXVALUE]] : i64, i32, index) { +// CHECK: acc.serial wait({[[I64VALUE]] : i64, [[I32VALUE]] : 
i32, [[IDXVALUE]] : index}) { // CHECK-NEXT: } // CHECK: %[[FIRSTP:.*]] = acc.firstprivate varPtr([[ARGB]] : memref<10xf32>) -> memref<10xf32> // CHECK: acc.serial firstprivate(@firstprivatization_memref_10xf32 -> %[[FIRSTP]] : memref<10xf32>) private(@privatization_memref_10_f32 -> [[ARGA]] : memref<10xf32>, @privatization_memref_10_10_f32 -> [[ARGC]] : memref<10x10xf32>) { @@ -665,13 +665,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 } acc.kernels async(%idxValue: index) { } - acc.kernels wait(%i64value: i64) { + acc.kernels wait({%i64value: i64}) { } - acc.kernels wait(%i32value: i32) { + acc.kernels wait({%i32value: i32}) { } - acc.kernels wait(%idxValue: index) { + acc.kernels wait({%idxValue: index}) { } - acc.kernels wait(%i64value, %i32value, %idxValue : i64, i32, index) { + acc.kernels wait({%i64value : i64, %i32value : i32, %idxValue : index}) { } acc.kernels { } attributes {defaultAttr = #acc<defaultvalue none>} @@ -699,13 +699,13 @@ func.func @testserialop(%a: memref<10xf32>, %b: memref<10xf32>, %c: memref<10x10 // CHECK-NEXT: } // CHECK: acc.kernels async([[IDXVALUE]] : index) { // CHECK-NEXT: } -// CHECK: acc.kernels wait([[I64VALUE]] : i64) { +// CHECK: acc.kernels wait({[[I64VALUE]] : i64}) { // CHECK-NEXT: } -// CHECK: acc.kernels wait([[I32VALUE]] : i32) { +// CHECK: acc.kernels wait({[[I32VALUE]] : i32}) { // CHECK-NEXT: } -// CHECK: acc.kernels wait([[IDXVALUE]] : index) { +// CHECK: acc.kernels wait({[[IDXVALUE]] : index}) { // CHECK-NEXT: } -// CHECK: acc.kernels wait([[I64VALUE]], [[I32VALUE]], [[IDXVALUE]] : i64, i32, index) { +// CHECK: acc.kernels wait({[[I64VALUE]] : i64, [[I32VALUE]] : i32, [[IDXVALUE]] : index}) { // CHECK-NEXT: } // CHECK: acc.kernels { // CHECK-NEXT: } attributes {defaultAttr = #acc<defaultvalue none>} From b3769adbc566abdee45975c190984545a281c636 Mon Sep 17 00:00:00 2001 From: Brandon Wu <brandon.wu@sifive.com> Date: Thu, 21 Dec 2023 13:24:26 +0800 Subject: [PATCH 024/342] [RISCV] Fix wrong lmul for sf_vfnrclip (#76016) --- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 10 +++---- .../CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll | 26 +++++++++---------- .../CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll | 26 +++++++++---------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index a16fa7e769929..0b1d5b664df97 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -362,11 +362,11 @@ multiclass VPseudoSiFiveVFWMACC<string Constraint = ""> { } multiclass VPseudoSiFiveVFNRCLIP<string Constraint = "@earlyclobber $rd"> { - foreach m = MxListVF4 in + foreach i = [0, 1, 2, 3, 4] in let hasSideEffects = 0 in - defm "Pseudo" # NAME : VPseudoBinaryRoundingMode<!if(!eq(m.vrclass, VRM8), - VRM2, VR), - m.vrclass, FPR32, m, + defm "Pseudo" # NAME : VPseudoBinaryRoundingMode<MxListW[i].vrclass, + MxListVF4[i].vrclass, + FPR32, MxListW[i], Constraint, /*sew*/0, UsesVXRM=0>; } @@ -594,7 +594,7 @@ multiclass VPatVFNRCLIP<string intrinsic, string instruction> { defvar Vti = pair.Vti; defvar Wti = pair.Wti; defm : VPatBinaryRoundingMode<"int_riscv_sf_" # intrinsic, - "Pseudo" # instruction # "_" # Wti.LMul.MX, + "Pseudo" # instruction # "_" # Vti.LMul.MX, Vti.Vector, Wti.Vector, Wti.Scalar, Vti.Mask, Vti.Log2SEW, Vti.RegClass, Wti.RegClass, Wti.ScalarRegClass>; diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll index 
b4f4a879a0b57..b44b57394321a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_x_f_qf.ll @@ -13,7 +13,7 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv1i8.nxv1f32.iXLen( define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv1i8_nxv1f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv1i8_nxv1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.x.f.qf v9, v8, fa0 ; CHECK-NEXT: fsrm a0 @@ -39,7 +39,7 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv1i8.nxv1f32.iXL define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv1i8_nxv1f32(<vscale x 1 x i8> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv1i8_nxv1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.x.f.qf v8, v9, fa0, v0.t ; CHECK-NEXT: fsrm a0 @@ -64,11 +64,11 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv2i8.nxv2f32.iXLen( define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv2i8_nxv2f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv2i8_nxv2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.x.f.qf v9, v8, fa0 ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret entry: %a = call <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv2i8.nxv2f32.iXLen( @@ -90,7 +90,7 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv2i8.nxv2f32.iXL define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv2i8_nxv2f32(<vscale x 2 x i8> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv2i8_nxv2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.x.f.qf v8, v9, fa0, v0.t ; CHECK-NEXT: fsrm a0 @@ -115,7 +115,7 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv4i8.nxv4f32.iXLen( define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv4i8_nxv4f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv4i8_nxv4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.x.f.qf v10, v8, fa0 ; CHECK-NEXT: fsrm a0 @@ -141,7 +141,7 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv4i8.nxv4f32.iXL define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv4i8_nxv4f32(<vscale x 4 x i8> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv4i8_nxv4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.x.f.qf v8, v10, fa0, v0.t ; CHECK-NEXT: fsrm a0 @@ -166,11 +166,11 @@ declare <vscale x 8 x 
i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv8i8.nxv8f32.iXLen( define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv8i8_nxv8f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv8i8_nxv8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.x.f.qf v12, v8, fa0 ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv8i8.nxv8f32.iXLen( @@ -192,7 +192,7 @@ declare <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv8i8.nxv8f32.iXL define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv8i8_nxv8f32(<vscale x 8 x i8> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv8i8_nxv8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.x.f.qf v8, v12, fa0, v0.t ; CHECK-NEXT: fsrm a0 @@ -217,11 +217,11 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv16i8.nxv16f32.iXLen define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_x_f_qf_nxv16i8_nxv16f32(<vscale x 16 x float> %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_nxv16i8_nxv16f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.x.f.qf v16, v8, fa0 ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vmv2r.v v8, v16 +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.nxv16i8.nxv16f32.iXLen( @@ -243,7 +243,7 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.x.f.qf.mask.nxv16i8.nxv16f32. 
define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_x_f_qf_mask_nxv16i8_nxv16f32(<vscale x 16 x i8> %0, <vscale x 16 x float> %1, float %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_x_f_qf_mask_nxv16i8_nxv16f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.x.f.qf v8, v16, fa0, v0.t ; CHECK-NEXT: fsrm a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll index 363cccd5ad356..bc2f7ca7dc860 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sf_vfnrclip_xu_f_qf.ll @@ -13,7 +13,7 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv1i8.nxv1f32.iXLen( define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv1i8_nxv1f32(<vscale x 1 x float> %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv1i8_nxv1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.xu.f.qf v9, v8, fa0 ; CHECK-NEXT: fsrm a0 @@ -39,7 +39,7 @@ declare <vscale x 1 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv1i8.nxv1f32.iX define <vscale x 1 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv1i8_nxv1f32(<vscale x 1 x i8> %0, <vscale x 1 x float> %1, float %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv1i8_nxv1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.xu.f.qf v8, v9, fa0, v0.t ; CHECK-NEXT: fsrm a0 @@ -64,11 +64,11 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv2i8.nxv2f32.iXLen( define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv2i8_nxv2f32(<vscale x 2 x float> %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv2i8_nxv2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.xu.f.qf v9, v8, fa0 ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret entry: %a = call <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv2i8.nxv2f32.iXLen( @@ -90,7 +90,7 @@ declare <vscale x 2 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv2i8.nxv2f32.iX define <vscale x 2 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv2i8_nxv2f32(<vscale x 2 x i8> %0, <vscale x 2 x float> %1, float %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv2i8_nxv2f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.xu.f.qf v8, v9, fa0, v0.t ; CHECK-NEXT: fsrm a0 @@ -115,7 +115,7 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv4i8.nxv4f32.iXLen( define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv4i8_nxv4f32(<vscale x 4 x float> %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv4i8_nxv4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.xu.f.qf v10, v8, fa0 ; CHECK-NEXT: 
fsrm a0 @@ -141,7 +141,7 @@ declare <vscale x 4 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv4i8.nxv4f32.iX define <vscale x 4 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv4i8_nxv4f32(<vscale x 4 x i8> %0, <vscale x 4 x float> %1, float %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv4i8_nxv4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.xu.f.qf v8, v10, fa0, v0.t ; CHECK-NEXT: fsrm a0 @@ -166,11 +166,11 @@ declare <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv8i8.nxv8f32.iXLen( define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv8i8_nxv8f32(<vscale x 8 x float> %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv8i8_nxv8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.xu.f.qf v12, v8, fa0 ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv8i8.nxv8f32.iXLen( @@ -192,7 +192,7 @@ declare <vscale x 8 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv8i8.nxv8f32.iX define <vscale x 8 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv8i8_nxv8f32(<vscale x 8 x i8> %0, <vscale x 8 x float> %1, float %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv8i8_nxv8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.xu.f.qf v8, v12, fa0, v0.t ; CHECK-NEXT: fsrm a0 @@ -217,11 +217,11 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv16i8.nxv16f32.iXLe define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_nxv16i8_nxv16f32(<vscale x 16 x float> %0, float %1, iXLen %2) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_nxv16i8_nxv16f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.xu.f.qf v16, v8, fa0 ; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vmv2r.v v8, v16 +; CHECK-NEXT: vmv.v.v v8, v16 ; CHECK-NEXT: ret entry: %a = call <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.nxv16i8.nxv16f32.iXLen( @@ -243,7 +243,7 @@ declare <vscale x 16 x i8> @llvm.riscv.sf.vfnrclip.xu.f.qf.mask.nxv16i8.nxv16f32 define <vscale x 16 x i8> @intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv16i8_nxv16f32(<vscale x 16 x i8> %0, <vscale x 16 x float> %1, float %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { ; CHECK-LABEL: intrinsic_sf_vfnrclip_xu_f_qf_mask_nxv16i8_nxv16f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: sf.vfnrclip.xu.f.qf v8, v16, fa0, v0.t ; CHECK-NEXT: fsrm a0 From d5c98e783779c4e6b26b2010b20cd0ab7210ead3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder@redhat.com> Date: Wed, 20 Dec 2023 15:21:06 +0100 Subject: [PATCH 025/342] [clang][AST][NFC] const-qualify some local references --- clang/lib/AST/Decl.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index c2ea155679193..12e0a6faa4c33 100644 --- 
a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -2943,7 +2943,7 @@ bool ParmVarDecl::isDestroyedInCallee() const { // FIXME: isParamDestroyedInCallee() should probably imply // isDestructedType() - auto *RT = getType()->getAs<RecordType>(); + const auto *RT = getType()->getAs<RecordType>(); if (RT && RT->getDecl()->isParamDestroyedInCallee() && getType().isDestructedType()) return true; @@ -3105,7 +3105,7 @@ FunctionDecl::getDefaultedFunctionInfo() const { } bool FunctionDecl::hasBody(const FunctionDecl *&Definition) const { - for (auto *I : redecls()) { + for (const auto *I : redecls()) { if (I->doesThisDeclarationHaveABody()) { Definition = I; return true; @@ -3116,7 +3116,7 @@ bool FunctionDecl::hasBody(const FunctionDecl *&Definition) const { } bool FunctionDecl::hasTrivialBody() const { - Stmt *S = getBody(); + const Stmt *S = getBody(); if (!S) { // Since we don't have a body for this function, we don't know if it's // trivial or not. @@ -3212,7 +3212,7 @@ void FunctionDecl::setPure(bool P) { template<std::size_t Len> static bool isNamed(const NamedDecl *ND, const char (&Str)[Len]) { - IdentifierInfo *II = ND->getIdentifier(); + const IdentifierInfo *II = ND->getIdentifier(); return II && II->isStr(Str); } @@ -3305,9 +3305,9 @@ bool FunctionDecl::isReservedGlobalPlacementOperator() const { if (proto->getNumParams() != 2 || proto->isVariadic()) return false; - ASTContext &Context = - cast<TranslationUnitDecl>(getDeclContext()->getRedeclContext()) - ->getASTContext(); + const ASTContext &Context = + cast<TranslationUnitDecl>(getDeclContext()->getRedeclContext()) + ->getASTContext(); // The result type and first argument type are constant across all // these operators. The second argument must be exactly void*. @@ -3342,7 +3342,7 @@ bool FunctionDecl::isReplaceableGlobalAllocationFunction( unsigned Params = 1; QualType Ty = FPT->getParamType(Params); - ASTContext &Ctx = getASTContext(); + const ASTContext &Ctx = getASTContext(); auto Consume = [&] { ++Params; @@ -3388,7 +3388,8 @@ bool FunctionDecl::isReplaceableGlobalAllocationFunction( QualType T = Ty; while (const auto *TD = T->getAs<TypedefType>()) T = TD->getDecl()->getUnderlyingType(); - IdentifierInfo *II = T->castAs<EnumType>()->getDecl()->getIdentifier(); + const IdentifierInfo *II = + T->castAs<EnumType>()->getDecl()->getIdentifier(); if (II && II->isStr("__hot_cold_t")) Consume(); } @@ -3586,7 +3587,7 @@ unsigned FunctionDecl::getBuiltinID(bool ConsiderWrapperFunctions) const { (!hasAttr<ArmBuiltinAliasAttr>() && !hasAttr<BuiltinAliasAttr>())) return 0; - ASTContext &Context = getASTContext(); + const ASTContext &Context = getASTContext(); if (!Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID)) return BuiltinID; @@ -3745,7 +3746,7 @@ bool FunctionDecl::doesDeclarationForceExternallyVisibleDefinition() const { assert(!doesThisDeclarationHaveABody() && "Must have a declaration without a body."); - ASTContext &Context = getASTContext(); + const ASTContext &Context = getASTContext(); if (Context.getLangOpts().MSVCCompat) { const FunctionDecl *Definition; From d8d09296ed8139c6c91ed7e467764ce7375f6667 Mon Sep 17 00:00:00 2001 From: Matthias Springer <me@m-sp.org> Date: Thu, 21 Dec 2023 16:00:18 +0900 Subject: [PATCH 026/342] [mlir][EmitC] Fix invalid rewriter API usage (#76124) When operations are modified in-place, the rewriter must be notified. This commit fixes `mlir/test/Dialect/EmitC/transforms.mlir` when running with `MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS` enabled. 
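As a rough illustration (using only the names that appear in the diff below, nothing new introduced), the fix amounts to collapsing the manual use replacement and erase, which the rewriter never hears about, into a single notifying call:

```cxx
// Before: the IR is mutated behind the rewriter's back.
Value clonedExpressionResult = clonedExpressionRootOp->getResult(0);
usedExpression.getResult().replaceAllUsesWith(clonedExpressionResult);
rewriter.eraseOp(usedExpression);

// After: replaceOp performs the same rewrite and notifies the rewriter,
// which is what the expensive pattern API checks verify.
rewriter.replaceOp(usedExpression, clonedExpressionRootOp);
```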
--- mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp b/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp index 593d774cac73b..88b691b50f325 100644 --- a/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/EmitC/Transforms/Transforms.cpp @@ -96,10 +96,7 @@ struct FoldExpressionOp : public OpRewritePattern<ExpressionOp> { assert(clonedExpressionRootOp->getNumResults() == 1 && "Expected cloned root to have a single result"); - Value clonedExpressionResult = clonedExpressionRootOp->getResult(0); - - usedExpression.getResult().replaceAllUsesWith(clonedExpressionResult); - rewriter.eraseOp(usedExpression); + rewriter.replaceOp(usedExpression, clonedExpressionRootOp); anythingFolded = true; } } From 9b561ca044cbb9f29a676ef85539c8e36becf579 Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Thu, 21 Dec 2023 15:03:36 +0800 Subject: [PATCH 027/342] [RISCV] Make performFP_TO_INTCombine fold with ISD::FRINT. (#76020) Fold (fp_to_int (frint X)) to (fcvt X) without rounding mode. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 10 +- .../CodeGen/RISCV/rvv/double-round-conv.ll | 184 +++--------------- .../CodeGen/RISCV/rvv/float-round-conv.ll | 132 ++----------- .../test/CodeGen/RISCV/rvv/half-round-conv.ll | 132 ++----------- 4 files changed, 62 insertions(+), 396 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 97d76ca494cbe..de15bea72e466 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13493,6 +13493,7 @@ static SDValue performMemPairCombine(SDNode *N, // (fp_to_int (ffloor X)) -> fcvt X, rdn // (fp_to_int (fceil X)) -> fcvt X, rup // (fp_to_int (fround X)) -> fcvt X, rmm +// (fp_to_int (frint X)) -> fcvt X static SDValue performFP_TO_INTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const RISCVSubtarget &Subtarget) { @@ -13516,10 +13517,7 @@ static SDValue performFP_TO_INTCombine(SDNode *N, RISCVFPRndMode::RoundingMode FRM = matchRoundingOp(Src.getOpcode()); // If the result is invalid, we didn't find a foldable instruction. - // If the result is dynamic, then we found an frint which we don't yet - // support. It will cause 7 to be written to the FRM CSR for vector. - // FIXME: We could support this by using VFCVT_X_F_VL/VFCVT_XU_F_VL below. - if (FRM == RISCVFPRndMode::Invalid || FRM == RISCVFPRndMode::DYN) + if (FRM == RISCVFPRndMode::Invalid) return SDValue(); SDLoc DL(N); @@ -13558,6 +13556,10 @@ static SDValue performFP_TO_INTCombine(SDNode *N, unsigned Opc = IsSigned ? RISCVISD::VFCVT_RTZ_X_F_VL : RISCVISD::VFCVT_RTZ_XU_F_VL; FpToInt = DAG.getNode(Opc, DL, ContainerVT, XVal, Mask, VL); + } else if (FRM == RISCVFPRndMode::DYN) { + unsigned Opc = + IsSigned ? RISCVISD::VFCVT_X_F_VL : RISCVISD::VFCVT_XU_F_VL; + FpToInt = DAG.getNode(Opc, DL, ContainerVT, XVal, Mask, VL); } else { unsigned Opc = IsSigned ? 
RISCVISD::VFCVT_RM_X_F_VL : RISCVISD::VFCVT_RM_XU_F_VL; diff --git a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll index 2ff0b21cd251e..ee9ad097b442b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll @@ -1209,33 +1209,15 @@ define <vscale x 1 x i16> @rint_nxv1f64_to_ui16(<vscale x 1 x double> %x) { define <vscale x 1 x i32> @rint_nxv1f64_to_si32(<vscale x 1 x double> %x) { ; RV32-LABEL: rint_nxv1f64_to_si32: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI36_0) -; RV32-NEXT: fld fa5, %lo(.LCPI36_0)(a0) -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vfabs.v v9, v8 -; RV32-NEXT: vmflt.vf v0, v9, fa5 -; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32-NEXT: vfncvt.rtz.x.f.w v9, v8 +; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; RV32-NEXT: vfncvt.x.f.w v9, v8 ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv1f64_to_si32: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI36_0) -; RV64-NEXT: fld fa5, %lo(.LCPI36_0)(a0) -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64-NEXT: vfabs.v v9, v8 -; RV64-NEXT: vmflt.vf v0, v9, fa5 -; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64-NEXT: vfncvt.rtz.x.f.w v9, v8 +; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; RV64-NEXT: vfncvt.x.f.w v9, v8 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret %a = call <vscale x 1 x double> @llvm.rint.nxv1f64(<vscale x 1 x double> %x) @@ -1246,33 +1228,15 @@ define <vscale x 1 x i32> @rint_nxv1f64_to_si32(<vscale x 1 x double> %x) { define <vscale x 1 x i32> @rint_nxv1f64_to_ui32(<vscale x 1 x double> %x) { ; RV32-LABEL: rint_nxv1f64_to_ui32: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI37_0) -; RV32-NEXT: fld fa5, %lo(.LCPI37_0)(a0) -; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vfabs.v v9, v8 -; RV32-NEXT: vmflt.vf v0, v9, fa5 -; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV32-NEXT: vfncvt.rtz.xu.f.w v9, v8 +; RV32-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; RV32-NEXT: vfncvt.xu.f.w v9, v8 ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv1f64_to_ui32: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI37_0) -; RV64-NEXT: fld fa5, %lo(.LCPI37_0)(a0) -; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64-NEXT: vfabs.v v9, v8 -; RV64-NEXT: vmflt.vf v0, v9, fa5 -; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64-NEXT: vfncvt.rtz.xu.f.w v9, v8 +; RV64-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; RV64-NEXT: vfncvt.xu.f.w v9, v8 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret %a = call <vscale x 1 x double> @llvm.rint.nxv1f64(<vscale x 1 x double> %x) @@ -1283,30 +1247,14 @@ define <vscale x 1 x i32> @rint_nxv1f64_to_ui32(<vscale x 1 x double> %x) { define <vscale x 1 x i64> @rint_nxv1f64_to_si64(<vscale x 1 x double> %x) { ; RV32-LABEL: 
rint_nxv1f64_to_si64: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI38_0) -; RV32-NEXT: fld fa5, %lo(.LCPI38_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vfabs.v v9, v8 -; RV32-NEXT: vmflt.vf v0, v9, fa5 -; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; RV32-NEXT: vfcvt.rtz.x.f.v v8, v8 +; RV32-NEXT: vfcvt.x.f.v v8, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv1f64_to_si64: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI38_0) -; RV64-NEXT: fld fa5, %lo(.LCPI38_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64-NEXT: vfabs.v v9, v8 -; RV64-NEXT: vmflt.vf v0, v9, fa5 -; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; RV64-NEXT: vfcvt.rtz.x.f.v v8, v8 +; RV64-NEXT: vfcvt.x.f.v v8, v8 ; RV64-NEXT: ret %a = call <vscale x 1 x double> @llvm.rint.nxv1f64(<vscale x 1 x double> %x) %b = fptosi <vscale x 1 x double> %a to <vscale x 1 x i64> @@ -1316,30 +1264,14 @@ define <vscale x 1 x i64> @rint_nxv1f64_to_si64(<vscale x 1 x double> %x) { define <vscale x 1 x i64> @rint_nxv1f64_to_ui64(<vscale x 1 x double> %x) { ; RV32-LABEL: rint_nxv1f64_to_ui64: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI39_0) -; RV32-NEXT: fld fa5, %lo(.LCPI39_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV32-NEXT: vfabs.v v9, v8 -; RV32-NEXT: vmflt.vf v0, v9, fa5 -; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; RV32-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; RV32-NEXT: vfcvt.xu.f.v v8, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv1f64_to_ui64: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI39_0) -; RV64-NEXT: fld fa5, %lo(.LCPI39_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; RV64-NEXT: vfabs.v v9, v8 -; RV64-NEXT: vmflt.vf v0, v9, fa5 -; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; RV64-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; RV64-NEXT: vfcvt.xu.f.v v8, v8 ; RV64-NEXT: ret %a = call <vscale x 1 x double> @llvm.rint.nxv1f64(<vscale x 1 x double> %x) %b = fptoui <vscale x 1 x double> %a to <vscale x 1 x i64> @@ -1519,33 +1451,15 @@ define <vscale x 4 x i16> @rint_nxv4f64_to_ui16(<vscale x 4 x double> %x) { define <vscale x 4 x i32> @rint_nxv4f64_to_si32(<vscale x 4 x double> %x) { ; RV32-LABEL: rint_nxv4f64_to_si32: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI44_0) -; RV32-NEXT: fld fa5, %lo(.LCPI44_0)(a0) -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vfabs.v v12, v8 -; RV32-NEXT: vmflt.vf v0, v12, fa5 -; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vfncvt.rtz.x.f.w v12, v8 +; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV32-NEXT: vfncvt.x.f.w v12, v8 ; RV32-NEXT: vmv.v.v v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv4f64_to_si32: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI44_0) -; RV64-NEXT: fld fa5, %lo(.LCPI44_0)(a0) -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV64-NEXT: vfabs.v v12, v8 -; RV64-NEXT: vmflt.vf v0, v12, fa5 -; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t -; RV64-NEXT: 
vfcvt.f.x.v v12, v12, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64-NEXT: vfncvt.rtz.x.f.w v12, v8 +; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV64-NEXT: vfncvt.x.f.w v12, v8 ; RV64-NEXT: vmv.v.v v8, v12 ; RV64-NEXT: ret %a = call <vscale x 4 x double> @llvm.rint.nxv4f64(<vscale x 4 x double> %x) @@ -1556,33 +1470,15 @@ define <vscale x 4 x i32> @rint_nxv4f64_to_si32(<vscale x 4 x double> %x) { define <vscale x 4 x i32> @rint_nxv4f64_to_ui32(<vscale x 4 x double> %x) { ; RV32-LABEL: rint_nxv4f64_to_ui32: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI45_0) -; RV32-NEXT: fld fa5, %lo(.LCPI45_0)(a0) -; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vfabs.v v12, v8 -; RV32-NEXT: vmflt.vf v0, v12, fa5 -; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vfncvt.rtz.xu.f.w v12, v8 +; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV32-NEXT: vfncvt.xu.f.w v12, v8 ; RV32-NEXT: vmv.v.v v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv4f64_to_ui32: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI45_0) -; RV64-NEXT: fld fa5, %lo(.LCPI45_0)(a0) -; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV64-NEXT: vfabs.v v12, v8 -; RV64-NEXT: vmflt.vf v0, v12, fa5 -; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64-NEXT: vfncvt.rtz.xu.f.w v12, v8 +; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV64-NEXT: vfncvt.xu.f.w v12, v8 ; RV64-NEXT: vmv.v.v v8, v12 ; RV64-NEXT: ret %a = call <vscale x 4 x double> @llvm.rint.nxv4f64(<vscale x 4 x double> %x) @@ -1593,30 +1489,14 @@ define <vscale x 4 x i32> @rint_nxv4f64_to_ui32(<vscale x 4 x double> %x) { define <vscale x 4 x i64> @rint_nxv4f64_to_si64(<vscale x 4 x double> %x) { ; RV32-LABEL: rint_nxv4f64_to_si64: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI46_0) -; RV32-NEXT: fld fa5, %lo(.LCPI46_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vfabs.v v12, v8 -; RV32-NEXT: vmflt.vf v0, v12, fa5 -; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; RV32-NEXT: vfcvt.rtz.x.f.v v8, v8 +; RV32-NEXT: vfcvt.x.f.v v8, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv4f64_to_si64: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI46_0) -; RV64-NEXT: fld fa5, %lo(.LCPI46_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV64-NEXT: vfabs.v v12, v8 -; RV64-NEXT: vmflt.vf v0, v12, fa5 -; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; RV64-NEXT: vfcvt.rtz.x.f.v v8, v8 +; RV64-NEXT: vfcvt.x.f.v v8, v8 ; RV64-NEXT: ret %a = call <vscale x 4 x double> @llvm.rint.nxv4f64(<vscale x 4 x double> %x) %b = fptosi <vscale x 4 x double> %a to <vscale x 4 x i64> @@ -1626,30 +1506,14 @@ define <vscale x 4 x i64> @rint_nxv4f64_to_si64(<vscale x 4 x double> %x) { define <vscale x 4 x i64> @rint_nxv4f64_to_ui64(<vscale x 4 x double> %x) { ; RV32-LABEL: rint_nxv4f64_to_ui64: ; RV32: # %bb.0: -; RV32-NEXT: lui a0, %hi(.LCPI47_0) -; RV32-NEXT: fld 
fa5, %lo(.LCPI47_0)(a0) ; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV32-NEXT: vfabs.v v12, v8 -; RV32-NEXT: vmflt.vf v0, v12, fa5 -; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t -; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; RV32-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; RV32-NEXT: vfcvt.xu.f.v v8, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv4f64_to_ui64: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI47_0) -; RV64-NEXT: fld fa5, %lo(.LCPI47_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; RV64-NEXT: vfabs.v v12, v8 -; RV64-NEXT: vmflt.vf v0, v12, fa5 -; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t -; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; RV64-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; RV64-NEXT: vfcvt.xu.f.v v8, v8 ; RV64-NEXT: ret %a = call <vscale x 4 x double> @llvm.rint.nxv4f64(<vscale x 4 x double> %x) %b = fptoui <vscale x 4 x double> %a to <vscale x 4 x i64> diff --git a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll index 46b1dd9d2b46d..9dcb6d211cb91 100644 --- a/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/float-round-conv.ll @@ -931,33 +931,15 @@ define <vscale x 4 x i8> @rint_nxv4f32_to_ui8(<vscale x 4 x float> %x) { define <vscale x 4 x i16> @rint_nxv4f32_to_si16(<vscale x 4 x float> %x) { ; RV32-LABEL: rint_nxv4f32_to_si16: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV32-NEXT: vfabs.v v10, v8 -; RV32-NEXT: lui a0, 307200 -; RV32-NEXT: fmv.w.x fa5, a0 -; RV32-NEXT: vmflt.vf v0, v10, fa5 -; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32-NEXT: vfncvt.rtz.x.f.w v10, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV32-NEXT: vfncvt.x.f.w v10, v8 ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv4f32_to_si16: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV64-NEXT: vfabs.v v10, v8 -; RV64-NEXT: lui a0, 307200 -; RV64-NEXT: fmv.w.x fa5, a0 -; RV64-NEXT: vmflt.vf v0, v10, fa5 -; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV64-NEXT: vfncvt.rtz.x.f.w v10, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV64-NEXT: vfncvt.x.f.w v10, v8 ; RV64-NEXT: vmv.v.v v8, v10 ; RV64-NEXT: ret %a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x) @@ -968,33 +950,15 @@ define <vscale x 4 x i16> @rint_nxv4f32_to_si16(<vscale x 4 x float> %x) { define <vscale x 4 x i16> @rint_nxv4f32_to_ui16(<vscale x 4 x float> %x) { ; RV32-LABEL: rint_nxv4f32_to_ui16: ; RV32: # %bb.0: -; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV32-NEXT: vfabs.v v10, v8 -; RV32-NEXT: lui a0, 307200 -; RV32-NEXT: fmv.w.x fa5, a0 -; RV32-NEXT: vmflt.vf v0, v10, fa5 -; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32-NEXT: vfncvt.rtz.xu.f.w v10, v8 +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV32-NEXT: vfncvt.xu.f.w v10, v8 ; RV32-NEXT: vmv.v.v 
v8, v10 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv4f32_to_ui16: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV64-NEXT: vfabs.v v10, v8 -; RV64-NEXT: lui a0, 307200 -; RV64-NEXT: fmv.w.x fa5, a0 -; RV64-NEXT: vmflt.vf v0, v10, fa5 -; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV64-NEXT: vfncvt.rtz.xu.f.w v10, v8 +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; RV64-NEXT: vfncvt.xu.f.w v10, v8 ; RV64-NEXT: vmv.v.v v8, v10 ; RV64-NEXT: ret %a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x) @@ -1006,29 +970,13 @@ define <vscale x 4 x i32> @rint_nxv4f32_to_si32(<vscale x 4 x float> %x) { ; RV32-LABEL: rint_nxv4f32_to_si32: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV32-NEXT: vfabs.v v10, v8 -; RV32-NEXT: lui a0, 307200 -; RV32-NEXT: fmv.w.x fa5, a0 -; RV32-NEXT: vmflt.vf v0, v10, fa5 -; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV32-NEXT: vfcvt.rtz.x.f.v v8, v8 +; RV32-NEXT: vfcvt.x.f.v v8, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv4f32_to_si32: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV64-NEXT: vfabs.v v10, v8 -; RV64-NEXT: lui a0, 307200 -; RV64-NEXT: fmv.w.x fa5, a0 -; RV64-NEXT: vmflt.vf v0, v10, fa5 -; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV64-NEXT: vfcvt.rtz.x.f.v v8, v8 +; RV64-NEXT: vfcvt.x.f.v v8, v8 ; RV64-NEXT: ret %a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x) %b = fptosi <vscale x 4 x float> %a to <vscale x 4 x i32> @@ -1039,29 +987,13 @@ define <vscale x 4 x i32> @rint_nxv4f32_to_ui32(<vscale x 4 x float> %x) { ; RV32-LABEL: rint_nxv4f32_to_ui32: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV32-NEXT: vfabs.v v10, v8 -; RV32-NEXT: lui a0, 307200 -; RV32-NEXT: fmv.w.x fa5, a0 -; RV32-NEXT: vmflt.vf v0, v10, fa5 -; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV32-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; RV32-NEXT: vfcvt.xu.f.v v8, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv4f32_to_ui32: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV64-NEXT: vfabs.v v10, v8 -; RV64-NEXT: lui a0, 307200 -; RV64-NEXT: fmv.w.x fa5, a0 -; RV64-NEXT: vmflt.vf v0, v10, fa5 -; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV64-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; RV64-NEXT: vfcvt.xu.f.v v8, v8 ; RV64-NEXT: ret %a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x) %b = fptoui <vscale x 4 x float> %a to <vscale x 4 x i32> @@ -1072,30 +1004,14 @@ define <vscale x 4 x i64> @rint_nxv4f32_to_si64(<vscale x 4 x float> %x) { ; RV32-LABEL: rint_nxv4f32_to_si64: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV32-NEXT: vfabs.v v10, v8 -; RV32-NEXT: lui a0, 307200 -; RV32-NEXT: fmv.w.x fa5, a0 -; RV32-NEXT: vmflt.vf v0, v10, fa5 -; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV32-NEXT: 
vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV32-NEXT: vfwcvt.rtz.x.f.v v12, v8 +; RV32-NEXT: vfwcvt.x.f.v v12, v8 ; RV32-NEXT: vmv4r.v v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv4f32_to_si64: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV64-NEXT: vfabs.v v10, v8 -; RV64-NEXT: lui a0, 307200 -; RV64-NEXT: fmv.w.x fa5, a0 -; RV64-NEXT: vmflt.vf v0, v10, fa5 -; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV64-NEXT: vfwcvt.rtz.x.f.v v12, v8 +; RV64-NEXT: vfwcvt.x.f.v v12, v8 ; RV64-NEXT: vmv4r.v v8, v12 ; RV64-NEXT: ret %a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x) @@ -1107,30 +1023,14 @@ define <vscale x 4 x i64> @rint_nxv4f32_to_ui64(<vscale x 4 x float> %x) { ; RV32-LABEL: rint_nxv4f32_to_ui64: ; RV32: # %bb.0: ; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV32-NEXT: vfabs.v v10, v8 -; RV32-NEXT: lui a0, 307200 -; RV32-NEXT: fmv.w.x fa5, a0 -; RV32-NEXT: vmflt.vf v0, v10, fa5 -; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV32-NEXT: vfwcvt.rtz.xu.f.v v12, v8 +; RV32-NEXT: vfwcvt.xu.f.v v12, v8 ; RV32-NEXT: vmv4r.v v8, v12 ; RV32-NEXT: ret ; ; RV64-LABEL: rint_nxv4f32_to_ui64: ; RV64: # %bb.0: ; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma -; RV64-NEXT: vfabs.v v10, v8 -; RV64-NEXT: lui a0, 307200 -; RV64-NEXT: fmv.w.x fa5, a0 -; RV64-NEXT: vmflt.vf v0, v10, fa5 -; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t -; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; RV64-NEXT: vfwcvt.rtz.xu.f.v v12, v8 +; RV64-NEXT: vfwcvt.xu.f.v v12, v8 ; RV64-NEXT: vmv4r.v v8, v12 ; RV64-NEXT: ret %a = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> %x) diff --git a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll index 2e960209f9ed3..6de62214ccc46 100644 --- a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll @@ -653,17 +653,8 @@ declare <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half>) define <vscale x 1 x i8> @rint_nxv1f16_to_si8(<vscale x 1 x half> %x) { ; CHECK-LABEL: rint_nxv1f16_to_si8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI32_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI32_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma -; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; CHECK-NEXT: vfncvt.x.f.w v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x) @@ -674,17 +665,8 @@ define <vscale x 1 x i8> @rint_nxv1f16_to_si8(<vscale x 1 x half> %x) { define <vscale x 1 x i8> @rint_nxv1f16_to_ui8(<vscale x 1 x half> %x) { ; CHECK-LABEL: rint_nxv1f16_to_ui8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI33_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI33_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, 
v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma -; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8 +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; CHECK-NEXT: vfncvt.xu.f.w v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x) @@ -695,16 +677,8 @@ define <vscale x 1 x i8> @rint_nxv1f16_to_ui8(<vscale x 1 x half> %x) { define <vscale x 1 x i16> @rint_nxv1f16_to_si16(<vscale x 1 x half> %x) { ; CHECK-LABEL: rint_nxv1f16_to_si16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI34_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI34_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 +; CHECK-NEXT: vfcvt.x.f.v v8, v8 ; CHECK-NEXT: ret %a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x) %b = fptosi <vscale x 1 x half> %a to <vscale x 1 x i16> @@ -714,16 +688,8 @@ define <vscale x 1 x i16> @rint_nxv1f16_to_si16(<vscale x 1 x half> %x) { define <vscale x 1 x i16> @rint_nxv1f16_to_ui16(<vscale x 1 x half> %x) { ; CHECK-LABEL: rint_nxv1f16_to_ui16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI35_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI35_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; CHECK-NEXT: vfcvt.xu.f.v v8, v8 ; CHECK-NEXT: ret %a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x) %b = fptoui <vscale x 1 x half> %a to <vscale x 1 x i16> @@ -733,16 +699,8 @@ define <vscale x 1 x i16> @rint_nxv1f16_to_ui16(<vscale x 1 x half> %x) { define <vscale x 1 x i32> @rint_nxv1f16_to_si32(<vscale x 1 x half> %x) { ; CHECK-LABEL: rint_nxv1f16_to_si32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI36_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI36_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vfwcvt.rtz.x.f.v v9, v8 +; CHECK-NEXT: vfwcvt.x.f.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x) @@ -753,16 +711,8 @@ define <vscale x 1 x i32> @rint_nxv1f16_to_si32(<vscale x 1 x half> %x) { define <vscale x 1 x i32> @rint_nxv1f16_to_ui32(<vscale x 1 x half> %x) { ; CHECK-LABEL: rint_nxv1f16_to_ui32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI37_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI37_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vfwcvt.rtz.xu.f.v v9, v8 +; CHECK-NEXT: 
vfwcvt.xu.f.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %a = call <vscale x 1 x half> @llvm.rint.nxv1f16(<vscale x 1 x half> %x) @@ -889,17 +839,8 @@ declare <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half>) define <vscale x 4 x i8> @rint_nxv4f16_to_si8(<vscale x 4 x half> %x) { ; CHECK-LABEL: rint_nxv4f16_to_si8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI40_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI40_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8 +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: vfncvt.x.f.w v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x) @@ -910,17 +851,8 @@ define <vscale x 4 x i8> @rint_nxv4f16_to_si8(<vscale x 4 x half> %x) { define <vscale x 4 x i8> @rint_nxv4f16_to_ui8(<vscale x 4 x half> %x) { ; CHECK-LABEL: rint_nxv4f16_to_ui8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI41_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI41_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8 +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: vfncvt.xu.f.w v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x) @@ -931,16 +863,8 @@ define <vscale x 4 x i8> @rint_nxv4f16_to_ui8(<vscale x 4 x half> %x) { define <vscale x 4 x i16> @rint_nxv4f16_to_si16(<vscale x 4 x half> %x) { ; CHECK-LABEL: rint_nxv4f16_to_si16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI42_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI42_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 +; CHECK-NEXT: vfcvt.x.f.v v8, v8 ; CHECK-NEXT: ret %a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x) %b = fptosi <vscale x 4 x half> %a to <vscale x 4 x i16> @@ -950,16 +874,8 @@ define <vscale x 4 x i16> @rint_nxv4f16_to_si16(<vscale x 4 x half> %x) { define <vscale x 4 x i16> @rint_nxv4f16_to_ui16(<vscale x 4 x half> %x) { ; CHECK-LABEL: rint_nxv4f16_to_ui16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI43_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI43_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; CHECK-NEXT: vfcvt.xu.f.v v8, v8 ; CHECK-NEXT: ret %a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x) %b = fptoui <vscale x 4 x half> %a to <vscale x 4 x i16> @@ -969,16 +885,8 @@ define 
<vscale x 4 x i16> @rint_nxv4f16_to_ui16(<vscale x 4 x half> %x) { define <vscale x 4 x i32> @rint_nxv4f16_to_si32(<vscale x 4 x half> %x) { ; CHECK-LABEL: rint_nxv4f16_to_si32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI44_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI44_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8 +; CHECK-NEXT: vfwcvt.x.f.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x) @@ -989,16 +897,8 @@ define <vscale x 4 x i32> @rint_nxv4f16_to_si32(<vscale x 4 x half> %x) { define <vscale x 4 x i32> @rint_nxv4f16_to_ui32(<vscale x 4 x half> %x) { ; CHECK-LABEL: rint_nxv4f16_to_ui32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI45_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI45_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v8 +; CHECK-NEXT: vfwcvt.xu.f.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %a = call <vscale x 4 x half> @llvm.rint.nxv4f16(<vscale x 4 x half> %x) From 72e8ab7d442f57cd65c89a4a99e76acd94788707 Mon Sep 17 00:00:00 2001 From: Qizhi Hu <836744285@qq.com> Date: Thu, 21 Dec 2023 15:05:33 +0800 Subject: [PATCH 028/342] [clang][ASTImporter] add processing of SubstNonTypeTemplateParmExpr in isAncestorDeclContextOf (#74991) Lack of processing of `SubstNonTypeTemplateParmExpr` in `isAncestorDeclContextOf` would make `hasAutoReturnTypeDeclaredInside` returns false and lead to infinite recursion. 
This patch adds the processor and try to fix [this issue](https://github.com/llvm/llvm-project/issues/74839) Co-authored-by: huqizhi <836744285@qq.com> --- clang/lib/AST/ASTImporter.cpp | 8 +++++++- clang/unittests/AST/ASTImporterTest.cpp | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 49d0dd218d683..1cc47de675bf3 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -3418,10 +3418,16 @@ static bool isAncestorDeclContextOf(const DeclContext *DC, const Stmt *S) { while (!ToProcess.empty()) { const Stmt *CurrentS = ToProcess.pop_back_val(); ToProcess.append(CurrentS->child_begin(), CurrentS->child_end()); - if (const auto *DeclRef = dyn_cast<DeclRefExpr>(CurrentS)) + if (const auto *DeclRef = dyn_cast<DeclRefExpr>(CurrentS)) { if (const Decl *D = DeclRef->getDecl()) if (isAncestorDeclContextOf(DC, D)) return true; + } else if (const auto *E = + dyn_cast_or_null<SubstNonTypeTemplateParmExpr>(CurrentS)) { + if (const Decl *D = E->getAssociatedDecl()) + if (isAncestorDeclContextOf(DC, D)) + return true; + } } return false; } diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 4dd7510bf8ddf..4c06152d3eb56 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -7250,6 +7250,26 @@ TEST_P(ImportAutoFunctions, ReturnWithAutoTemplateType) { Lang_CXX14, /*FindLast=*/true); } +TEST_P(ImportAutoFunctions, ReturnWithSubstNonTypeTemplateParmExpr) { + const char *Code = + R"( + template<int> + struct array {}; + + template <int N> + auto foo() { return array<N>(); } + + void bar() { foo<0>(); } + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX17); + + auto *FromBar = FirstDeclMatcher<FunctionDecl>().match( + FromTU, functionDecl(hasName("bar"))); + + auto *ToBar = Import(FromBar, Lang_CXX17); + EXPECT_TRUE(ToBar); +} + struct ImportSourceLocations : ASTImporterOptionSpecificTestBase {}; TEST_P(ImportSourceLocations, PreserveFileIDTreeStructure) { From 9971b9ab195dd629fb2625c5c5c674d355760231 Mon Sep 17 00:00:00 2001 From: Tobias Gysi <tobias.gysi@nextsilicon.com> Date: Thu, 21 Dec 2023 08:11:17 +0100 Subject: [PATCH 029/342] [mlir][llvm] Improve alloca handling during inlining (#75961) This revision changes the alloca handling in the LLVM inliner. It ensures that alloca operations, even those nested within a region operation, can be relocated to the entry block of the function, or the closest ancestor region that is marked with either the isolated from above or automatic allocation scope trait. While the LLVM dialect does not have any region operations, the inlining interface may be used on IR that mixes different dialects. 
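A minimal sketch of the lookup this introduces (the helper name below is illustrative only; the real logic lives inline in `handleInlinedAllocas` in the diff that follows):

```cxx
// Walk up from the call site to the first ancestor that bounds an
// allocation scope; the entry block of the region holding the current
// operation is where the callee's static allocas may be relocated.
static Block *findAllocaInsertionBlock(Operation *call) {
  Operation *currentOp = call;
  while (Operation *parentOp = currentOp->getParentOp()) {
    if (parentOp->mightHaveTrait<OpTrait::IsIsolatedFromAbove>() ||
        parentOp->mightHaveTrait<OpTrait::AutomaticAllocationScope>())
      return &currentOp->getParentRegion()->front();
    currentOp = parentOp;
  }
  return nullptr; // No enclosing scope found; leave the allocas in place.
}
```

If no such block can be identified, or the inlined blocks already live in it, the allocas are left where they are.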
--- mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp | 26 ++++++++++-- mlir/test/Dialect/LLVMIR/inlining.mlir | 47 +++++++++++++++++++++ mlir/test/lib/Dialect/Test/TestOps.td | 6 +++ 3 files changed, 76 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp index 6e9019f932aa8..65c1daee6711a 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMInlining.cpp @@ -50,11 +50,31 @@ static bool hasLifetimeMarkers(LLVM::AllocaOp allocaOp) { static void handleInlinedAllocas(Operation *call, iterator_range<Region::iterator> inlinedBlocks) { + // Locate the entry block of the closest callsite ancestor that has either the + // IsolatedFromAbove or AutomaticAllocationScope trait. In pure LLVM dialect + // programs, this is the LLVMFuncOp containing the call site. However, in + // mixed-dialect programs, the callsite might be nested in another operation + // that carries one of these traits. In such scenarios, this traversal stops + // at the closest ancestor with either trait, ensuring visibility post + // relocation and respecting allocation scopes. + Block *callerEntryBlock = nullptr; + Operation *currentOp = call; + while (Operation *parentOp = currentOp->getParentOp()) { + if (parentOp->mightHaveTrait<OpTrait::IsIsolatedFromAbove>() || + parentOp->mightHaveTrait<OpTrait::AutomaticAllocationScope>()) { + callerEntryBlock = ¤tOp->getParentRegion()->front(); + break; + } + currentOp = parentOp; + } + + // Avoid relocating the alloca operations if the call has been inlined into + // the entry block already, which is typically the encompassing + // LLVM function, or if the relevant entry block cannot be identified. Block *calleeEntryBlock = &(*inlinedBlocks.begin()); - Block *callerEntryBlock = &(*calleeEntryBlock->getParent()->begin()); - if (calleeEntryBlock == callerEntryBlock) - // Nothing to do. + if (!callerEntryBlock || callerEntryBlock == calleeEntryBlock) return; + SmallVector<std::tuple<LLVM::AllocaOp, IntegerAttr, bool>> allocasToMove; bool shouldInsertLifetimes = false; bool hasDynamicAlloca = false; diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir index b684be1f9626b..63e7a46f1bdb0 100644 --- a/mlir/test/Dialect/LLVMIR/inlining.mlir +++ b/mlir/test/Dialect/LLVMIR/inlining.mlir @@ -324,6 +324,53 @@ llvm.func @test_inline(%cond0 : i1, %cond1 : i1, %funcArg : f32) -> f32 { // ----- +llvm.func @static_alloca() -> f32 { + %0 = llvm.mlir.constant(4 : i32) : i32 + %1 = llvm.alloca %0 x f32 : (i32) -> !llvm.ptr + %2 = llvm.load %1 : !llvm.ptr -> f32 + llvm.return %2 : f32 +} + +// CHECK-LABEL: llvm.func @test_inline +llvm.func @test_inline(%cond0 : i1) { + // Verify the alloca is relocated to the entry block of the parent function + // if the region operation is neither marked as isolated from above or + // automatic allocation scope. + // CHECK: %[[ALLOCA:.+]] = llvm.alloca + // CHECK: "test.one_region_op"() ({ + "test.one_region_op"() ({ + %0 = llvm.call @static_alloca() : () -> f32 + // CHECK-NEXT: llvm.intr.lifetime.start 4, %[[ALLOCA]] + // CHECK-NEXT: %[[RES:.+]] = llvm.load %[[ALLOCA]] + // CHECK-NEXT: llvm.intr.lifetime.end 4, %[[ALLOCA]] + // CHECK-NEXT: test.region_yield %[[RES]] + test.region_yield %0 : f32 + }) : () -> () + // Verify the alloca is not relocated out of operations that are marked as + // isolated from above. 
+ // CHECK-NOT: llvm.alloca + // CHECK: test.isolated_regions + test.isolated_regions { + // CHECK: %[[ALLOCA:.+]] = llvm.alloca + %0 = llvm.call @static_alloca() : () -> f32 + // CHECK: test.region_yield + test.region_yield %0 : f32 + } + // Verify the alloca is not relocated out of operations that are marked as + // automatic allocation scope. + // CHECK-NOT: llvm.alloca + // CHECK: test.alloca_scope_region + test.alloca_scope_region { + // CHECK: %[[ALLOCA:.+]] = llvm.alloca + %0 = llvm.call @static_alloca() : () -> f32 + // CHECK: test.region_yield + test.region_yield %0 : f32 + } + llvm.return +} + +// ----- + llvm.func @alloca_with_lifetime(%cond: i1) -> f32 { %0 = llvm.mlir.constant(4 : i32) : i32 %1 = llvm.alloca %0 x f32 : (i32) -> !llvm.ptr diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 70ccc71883e3c..48b41d8698762 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -480,6 +480,12 @@ def IsolatedRegionsOp : TEST_Op<"isolated_regions", [IsolatedFromAbove]> { let assemblyFormat = "attr-dict-with-keyword $regions"; } +def AllocaScopeRegionOp : TEST_Op<"alloca_scope_region", + [AutomaticAllocationScope]> { + let regions = (region AnyRegion:$region); + let assemblyFormat = "attr-dict-with-keyword $region"; +} + //===----------------------------------------------------------------------===// // NoTerminator Operation //===----------------------------------------------------------------------===// From 591fc4f5242adf2f4d2dbd301c21be578b9b9278 Mon Sep 17 00:00:00 2001 From: Christian Sigg <chsigg@users.noreply.github.com> Date: Thu, 21 Dec 2023 08:20:10 +0100 Subject: [PATCH 030/342] [bazel] Fix build after b4e7ae883f8d75b7a464d030e14bfeca2b796d49f797bdd0674cbf13977d0b65 --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index fd74bac5a8c5b..f035a17833d8e 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -5225,6 +5225,7 @@ cc_binary( copts = llvm_copts, stamp = 0, deps = [ + ":BinaryFormat", ":Object", ":Option", ":ReadTAPIOptsTableGen", From 2ee396b0b102a857ec918beb583c3e71718efbce Mon Sep 17 00:00:00 2001 From: martinboehme <mboehme@google.com> Date: Thu, 21 Dec 2023 09:02:20 +0100 Subject: [PATCH 031/342] [clang][dataflow] Add `Environment::get<>()`. (#76027) This template function casts the result of `getValue()` or `getStorageLocation()` to a given subclass of `Value` or `StorageLocation` (using `cast_or_null`). It's a common pattern to do something like this: ```cxx auto *Val = cast_or_null<PointerValue>(Env.getValue(E)); ``` This can now be expressed more concisely like this: ```cxx auto *Val = Env.get<PointerValue>(E); ``` Instead of adding a new method `get()`, I had originally considered simply adding a template parameter to `getValue()` and `getStorageLocation()` (with a default argument of `Value` or `StorageLocation`), but this results in an undesirable repetition at the callsite, e.g. `getStorageLocation<RecordStorageLocation>(...)`. The `Value` and `StorageLocation` in the method name adds nothing of value when the template argument already contains this information, so it seemed best to shorten the method name to simply `get()`. 
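For context, each new accessor is a thin wrapper of roughly this shape (mirroring the header change below; only the overload for `Value` subclasses keyed on an `Expr` is shown):

```cxx
template <typename T>
std::enable_if_t<std::is_base_of_v<Value, T>, T *> get(const Expr &E) const {
  return cast_or_null<T>(getValue(E));
}
```

Callers that cannot guarantee the dynamic type should keep using `dyn_cast_or_null<T>(getValue(...))`, as the new doc comments note.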
--- .../FlowSensitive/DataflowEnvironment.h | 36 +++++++++++++++++++ .../FlowSensitive/DataflowEnvironment.cpp | 11 +++--- .../Models/UncheckedOptionalAccessModel.cpp | 31 +++++++--------- .../lib/Analysis/FlowSensitive/RecordOps.cpp | 8 ++--- clang/lib/Analysis/FlowSensitive/Transfer.cpp | 16 ++++----- .../TypeErasedDataflowAnalysis.cpp | 2 +- .../FlowSensitive/SignAnalysisTest.cpp | 2 +- 7 files changed, 65 insertions(+), 41 deletions(-) diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index 5943af50b6ad8..2a9f8dce74c0a 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -289,6 +289,22 @@ class Environment { /// `E` must be a glvalue or a `BuiltinType::BuiltinFn` StorageLocation *getStorageLocation(const Expr &E) const; + /// Returns the result of casting `getStorageLocation(...)` to a subclass of + /// `StorageLocation` (using `cast_or_null<T>`). + /// This assert-fails if the result of `getStorageLocation(...)` is not of + /// type `T *`; if the storage location is not guaranteed to have type `T *`, + /// consider using `dyn_cast_or_null<T>(getStorageLocation(...))` instead. + template <typename T> + std::enable_if_t<std::is_base_of_v<StorageLocation, T>, T *> + get(const ValueDecl &D) const { + return cast_or_null<T>(getStorageLocation(D)); + } + template <typename T> + std::enable_if_t<std::is_base_of_v<StorageLocation, T>, T *> + get(const Expr &E) const { + return cast_or_null<T>(getStorageLocation(E)); + } + /// Returns the storage location assigned to the `this` pointee in the /// environment or null if the `this` pointee has no assigned storage location /// in the environment. @@ -457,6 +473,26 @@ class Environment { /// storage location in the environment, otherwise returns null. Value *getValue(const Expr &E) const; + /// Returns the result of casting `getValue(...)` to a subclass of `Value` + /// (using `cast_or_null<T>`). + /// This assert-fails if the result of `getValue(...)` is not of type `T *`; + /// if the value is not guaranteed to have type `T *`, consider using + /// `dyn_cast_or_null<T>(getValue(...))` instead. + template <typename T> + std::enable_if_t<std::is_base_of_v<Value, T>, T *> + get(const StorageLocation &Loc) const { + return cast_or_null<T>(getValue(Loc)); + } + template <typename T> + std::enable_if_t<std::is_base_of_v<Value, T>, T *> + get(const ValueDecl &D) const { + return cast_or_null<T>(getValue(D)); + } + template <typename T> + std::enable_if_t<std::is_base_of_v<Value, T>, T *> get(const Expr &E) const { + return cast_or_null<T>(getValue(E)); + } + // FIXME: should we deprecate the following & call arena().create() directly? 
/// Creates a `T` (some subclass of `Value`), forwarding `args` to the diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index 93919cd0243d0..96fe6df88dbb9 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -1034,7 +1034,7 @@ RecordStorageLocation *getImplicitObjectLocation(const CXXMemberCallExpr &MCE, if (ImplicitObject == nullptr) return nullptr; if (ImplicitObject->getType()->isPointerType()) { - if (auto *Val = cast_or_null<PointerValue>(Env.getValue(*ImplicitObject))) + if (auto *Val = Env.get<PointerValue>(*ImplicitObject)) return &cast<RecordStorageLocation>(Val->getPointeeLoc()); return nullptr; } @@ -1048,11 +1048,11 @@ RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME, if (Base == nullptr) return nullptr; if (ME.isArrow()) { - if (auto *Val = cast_or_null<PointerValue>(Env.getValue(*Base))) + if (auto *Val = Env.get<PointerValue>(*Base)) return &cast<RecordStorageLocation>(Val->getPointeeLoc()); return nullptr; } - return cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*Base)); + return Env.get<RecordStorageLocation>(*Base); } std::vector<FieldDecl *> getFieldsForInitListExpr(const RecordDecl *RD) { @@ -1077,7 +1077,7 @@ RecordValue &refreshRecordValue(const Expr &Expr, Environment &Env) { assert(Expr.getType()->isRecordType()); if (Expr.isPRValue()) { - if (auto *ExistingVal = cast_or_null<RecordValue>(Env.getValue(Expr))) { + if (auto *ExistingVal = Env.get<RecordValue>(Expr)) { auto &NewVal = Env.create<RecordValue>(ExistingVal->getLoc()); Env.setValue(Expr, NewVal); Env.setValue(NewVal.getLoc(), NewVal); @@ -1089,8 +1089,7 @@ RecordValue &refreshRecordValue(const Expr &Expr, Environment &Env) { return NewVal; } - if (auto *Loc = - cast_or_null<RecordStorageLocation>(Env.getStorageLocation(Expr))) { + if (auto *Loc = Env.get<RecordStorageLocation>(Expr)) { auto &NewVal = Env.create<RecordValue>(*Loc); Env.setValue(*Loc, NewVal); return NewVal; diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp index 69ac2c2b82cff..1d31b22b6d25f 100644 --- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp +++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedOptionalAccessModel.cpp @@ -226,7 +226,7 @@ auto isComparisonOperatorCall(L lhs_arg_matcher, R rhs_arg_matcher) { /// Ensures that `Expr` is mapped to a `BoolValue` and returns its formula. 
const Formula &forceBoolValue(Environment &Env, const Expr &Expr) { - auto *Value = cast_or_null<BoolValue>(Env.getValue(Expr)); + auto *Value = Env.get<BoolValue>(Expr); if (Value != nullptr) return Value->formula(); @@ -267,7 +267,7 @@ BoolValue *getHasValue(Environment &Env, RecordStorageLocation *OptionalLoc) { if (OptionalLoc == nullptr) return nullptr; StorageLocation &HasValueLoc = locForHasValue(*OptionalLoc); - auto *HasValueVal = cast_or_null<BoolValue>(Env.getValue(HasValueLoc)); + auto *HasValueVal = Env.get<BoolValue>(HasValueLoc); if (HasValueVal == nullptr) { HasValueVal = &Env.makeAtomicBoolValue(); Env.setValue(HasValueLoc, *HasValueVal); @@ -406,7 +406,7 @@ void transferCallReturningOptional(const CallExpr *E, if (E->isPRValue()) { Loc = &State.Env.getResultObjectLocation(*E); } else { - Loc = cast_or_null<RecordStorageLocation>(State.Env.getStorageLocation(*E)); + Loc = State.Env.get<RecordStorageLocation>(*E); if (Loc == nullptr) { Loc = &cast<RecordStorageLocation>(State.Env.createStorageLocation(*E)); State.Env.setStorageLocation(*E, *Loc); @@ -449,8 +449,7 @@ BoolValue &valueOrConversionHasValue(const FunctionDecl &F, const Expr &E, // This is a constructor/assignment call for `optional<T>` with argument of // type `optional<U>` such that `T` is constructible from `U`. - auto *Loc = - cast_or_null<RecordStorageLocation>(State.Env.getStorageLocation(E)); + auto *Loc = State.Env.get<RecordStorageLocation>(E); if (auto *HasValueVal = getHasValue(State.Env, Loc)) return *HasValueVal; return State.Env.makeAtomicBoolValue(); @@ -471,8 +470,7 @@ void transferAssignment(const CXXOperatorCallExpr *E, BoolValue &HasValueVal, LatticeTransferState &State) { assert(E->getNumArgs() > 0); - if (auto *Loc = cast_or_null<RecordStorageLocation>( - State.Env.getStorageLocation(*E->getArg(0)))) { + if (auto *Loc = State.Env.get<RecordStorageLocation>(*E->getArg(0))) { createOptionalValue(*Loc, HasValueVal, State.Env); // Assign a storage location for the whole expression. 
@@ -534,18 +532,15 @@ void transferSwapCall(const CXXMemberCallExpr *E, const MatchFinder::MatchResult &, LatticeTransferState &State) { assert(E->getNumArgs() == 1); - auto *OtherLoc = cast_or_null<RecordStorageLocation>( - State.Env.getStorageLocation(*E->getArg(0))); + auto *OtherLoc = State.Env.get<RecordStorageLocation>(*E->getArg(0)); transferSwap(getImplicitObjectLocation(*E, State.Env), OtherLoc, State.Env); } void transferStdSwapCall(const CallExpr *E, const MatchFinder::MatchResult &, LatticeTransferState &State) { assert(E->getNumArgs() == 2); - auto *Arg0Loc = cast_or_null<RecordStorageLocation>( - State.Env.getStorageLocation(*E->getArg(0))); - auto *Arg1Loc = cast_or_null<RecordStorageLocation>( - State.Env.getStorageLocation(*E->getArg(1))); + auto *Arg0Loc = State.Env.get<RecordStorageLocation>(*E->getArg(0)); + auto *Arg1Loc = State.Env.get<RecordStorageLocation>(*E->getArg(1)); transferSwap(Arg0Loc, Arg1Loc, State.Env); } @@ -585,11 +580,9 @@ void transferOptionalAndOptionalCmp(const clang::CXXOperatorCallExpr *CmpExpr, Environment &Env = State.Env; auto &A = Env.arena(); auto *CmpValue = &forceBoolValue(Env, *CmpExpr); - auto *Arg0Loc = cast_or_null<RecordStorageLocation>( - Env.getStorageLocation(*CmpExpr->getArg(0))); + auto *Arg0Loc = Env.get<RecordStorageLocation>(*CmpExpr->getArg(0)); if (auto *LHasVal = getHasValue(Env, Arg0Loc)) { - auto *Arg1Loc = cast_or_null<RecordStorageLocation>( - Env.getStorageLocation(*CmpExpr->getArg(1))); + auto *Arg1Loc = Env.get<RecordStorageLocation>(*CmpExpr->getArg(1)); if (auto *RHasVal = getHasValue(Env, Arg1Loc)) { if (CmpExpr->getOperator() == clang::OO_ExclaimEqual) CmpValue = &A.makeNot(*CmpValue); @@ -603,7 +596,7 @@ void transferOptionalAndValueCmp(const clang::CXXOperatorCallExpr *CmpExpr, const clang::Expr *E, Environment &Env) { auto &A = Env.arena(); auto *CmpValue = &forceBoolValue(Env, *CmpExpr); - auto *Loc = cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*E)); + auto *Loc = Env.get<RecordStorageLocation>(*E); if (auto *HasVal = getHasValue(Env, Loc)) { if (CmpExpr->getOperator() == clang::OO_ExclaimEqual) CmpValue = &A.makeNot(*CmpValue); @@ -616,7 +609,7 @@ void transferOptionalAndNulloptCmp(const clang::CXXOperatorCallExpr *CmpExpr, const clang::Expr *E, Environment &Env) { auto &A = Env.arena(); auto *CmpValue = &forceBoolValue(Env, *CmpExpr); - auto *Loc = cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*E)); + auto *Loc = Env.get<RecordStorageLocation>(*E); if (auto *HasVal = getHasValue(Env, Loc)) { if (CmpExpr->getOperator() == clang::OO_ExclaimEqual) CmpValue = &A.makeNot(*CmpValue); diff --git a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp index caaf443382b02..a326826db2394 100644 --- a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp +++ b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp @@ -66,8 +66,8 @@ void clang::dataflow::copyRecord(RecordStorageLocation &Src, } } - RecordValue *SrcVal = cast_or_null<RecordValue>(Env.getValue(Src)); - RecordValue *DstVal = cast_or_null<RecordValue>(Env.getValue(Dst)); + RecordValue *SrcVal = Env.get<RecordValue>(Src); + RecordValue *DstVal = Env.get<RecordValue>(Dst); DstVal = &Env.create<RecordValue>(Dst); Env.setValue(Dst, *DstVal); @@ -127,10 +127,10 @@ bool clang::dataflow::recordsEqual(const RecordStorageLocation &Loc1, llvm::StringMap<Value *> Props1, Props2; - if (RecordValue *Val1 = cast_or_null<RecordValue>(Env1.getValue(Loc1))) + if (RecordValue *Val1 = Env1.get<RecordValue>(Loc1)) for 
(const auto &[Name, Value] : Val1->properties()) Props1[Name] = Value; - if (RecordValue *Val2 = cast_or_null<RecordValue>(Env2.getValue(Loc2))) + if (RecordValue *Val2 = Env2.get<RecordValue>(Loc2)) for (const auto &[Name, Value] : Val2->properties()) Props2[Name] = Value; diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index 346469660662e..55093c2e2cdaf 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -339,8 +339,7 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> { switch (S->getOpcode()) { case UO_Deref: { - const auto *SubExprVal = - cast_or_null<PointerValue>(Env.getValue(*SubExpr)); + const auto *SubExprVal = Env.get<PointerValue>(*SubExpr); if (SubExprVal == nullptr) break; @@ -467,8 +466,7 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> { const Expr *Arg = S->getArg(0); assert(Arg != nullptr); - auto *ArgLoc = - cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*Arg)); + auto *ArgLoc = Env.get<RecordStorageLocation>(*Arg); if (ArgLoc == nullptr) return; @@ -515,14 +513,12 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> { RecordStorageLocation *LocSrc = nullptr; if (Arg1->isPRValue()) { - if (auto *Val = cast_or_null<RecordValue>(Env.getValue(*Arg1))) + if (auto *Val = Env.get<RecordValue>(*Arg1)) LocSrc = &Val->getLoc(); } else { - LocSrc = - cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*Arg1)); + LocSrc = Env.get<RecordStorageLocation>(*Arg1); } - auto *LocDst = - cast_or_null<RecordStorageLocation>(Env.getStorageLocation(*Arg0)); + auto *LocDst = Env.get<RecordStorageLocation>(*Arg0); if (LocSrc == nullptr || LocDst == nullptr) return; @@ -676,7 +672,7 @@ class TransferVisitor : public ConstStmtVisitor<TransferVisitor> { auto Init = Inits[InitIdx++]; assert(Base.getType().getCanonicalType() == Init->getType().getCanonicalType()); - auto* BaseVal = cast_or_null<RecordValue>(Env.getValue(*Init)); + auto *BaseVal = Env.get<RecordValue>(*Init); if (!BaseVal) BaseVal = cast<RecordValue>(Env.createValue(Init->getType())); // Take ownership of the fields of the `RecordValue` for the base class diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp index 8c9360235da7c..faf83a8920d4e 100644 --- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp +++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp @@ -130,7 +130,7 @@ class TerminatorVisitor if (Env.getValue(Cond) == nullptr) transfer(StmtToEnv, Cond, Env); - auto *Val = cast_or_null<BoolValue>(Env.getValue(Cond)); + auto *Val = Env.get<BoolValue>(Cond); // Value merging depends on flow conditions from different environments // being mutually exclusive -- that is, they cannot both be true in their // entirety (even if they may share some clauses). 
So, we need *some* value diff --git a/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp index 362b0dea58d6b..b5fc7bbc431ea 100644 --- a/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp @@ -133,7 +133,7 @@ void transferBinary(const BinaryOperator *BO, const MatchFinder::MatchResult &M, LatticeTransferState &State) { auto &A = State.Env.arena(); const Formula *Comp; - if (BoolValue *V = cast_or_null<BoolValue>(State.Env.getValue(*BO))) { + if (BoolValue *V = State.Env.get<BoolValue>(*BO)) { Comp = &V->formula(); } else { Comp = &A.makeAtomRef(A.makeAtom()); From da4bd5bece9a32d9a5f903e0a3a6cb8233dd43f2 Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Thu, 21 Dec 2023 00:10:28 -0800 Subject: [PATCH 032/342] [test][hwasan] Clang-format a test --- .../hwasan/TestCases/heap-buffer-overflow.c | 62 +++++++++---------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c b/compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c index 4e6638be584b0..d390017dd7555 100644 --- a/compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c +++ b/compiler-rt/test/hwasan/TestCases/heap-buffer-overflow.c @@ -36,36 +36,36 @@ int main(int argc, char **argv) { } #endif -// CHECK40: allocated heap chunk; size: 32 offset: 8 -// CHECK40: Cause: heap-buffer-overflow -// CHECK40: is located 10 bytes after a 30-byte region -// -// CHECK80: allocated heap chunk; size: 32 offset: 16 -// CHECK80: Cause: heap-buffer-overflow -// CHECK80: is located 50 bytes after a 30-byte region -// -// CHECKm30: Cause: heap-buffer-overflow -// CHECKm30: is located 30 bytes before a 30-byte region -// -// CHECKMm30: is a large allocated heap chunk; size: 1003520 offset: -30 -// CHECKMm30: Cause: heap-buffer-overflow -// CHECKMm30: is located 30 bytes before a 1000000-byte region -// -// CHECKM: is a large allocated heap chunk; size: 1003520 offset: 1000000 -// CHECKM: Cause: heap-buffer-overflow -// CHECKM: is located 0 bytes after a 1000000-byte region -// -// CHECK31: tags: [[TAG:..]]/0e([[TAG]]) (ptr/mem) -// CHECK31-NOT: Invalid access starting at offset -// CHECK31: Cause: heap-buffer-overflow -// CHECK31: is located 1 bytes after a 30-byte region -// CHECK31: Memory tags around the buggy address -// CHECK31: [0e] -// CHECK31: Tags for short granules around the buggy address -// CHECK31: {{\[}}[[TAG]]] -// -// CHECK20-NOT: Invalid access starting at offset -// CHECK20: Cause: heap-buffer-overflow -// CHECK20: is located 10 bytes after a 20-byte region [0x{{.*}}0,0x{{.*}}4) + // CHECK40: allocated heap chunk; size: 32 offset: 8 + // CHECK40: Cause: heap-buffer-overflow + // CHECK40: is located 10 bytes after a 30-byte region + // + // CHECK80: allocated heap chunk; size: 32 offset: 16 + // CHECK80: Cause: heap-buffer-overflow + // CHECK80: is located 50 bytes after a 30-byte region + // + // CHECKm30: Cause: heap-buffer-overflow + // CHECKm30: is located 30 bytes before a 30-byte region + // + // CHECKMm30: is a large allocated heap chunk; size: 1003520 offset: -30 + // CHECKMm30: Cause: heap-buffer-overflow + // CHECKMm30: is located 30 bytes before a 1000000-byte region + // + // CHECKM: is a large allocated heap chunk; size: 1003520 offset: 1000000 + // CHECKM: Cause: heap-buffer-overflow + // CHECKM: is located 0 bytes after a 1000000-byte region + // + // CHECK31: tags: [[TAG:..]]/0e([[TAG]]) 
(ptr/mem) + // CHECK31-NOT: Invalid access starting at offset + // CHECK31: Cause: heap-buffer-overflow + // CHECK31: is located 1 bytes after a 30-byte region + // CHECK31: Memory tags around the buggy address + // CHECK31: [0e] + // CHECK31: Tags for short granules around the buggy address + // CHECK31: {{\[}}[[TAG]]] + // + // CHECK20-NOT: Invalid access starting at offset + // CHECK20: Cause: heap-buffer-overflow + // CHECK20: is located 10 bytes after a 20-byte region [0x{{.*}}0,0x{{.*}}4) free(x); } From 469374e5c4ba7c75327096a4db6b8ee92065c378 Mon Sep 17 00:00:00 2001 From: martinboehme <mboehme@google.com> Date: Thu, 21 Dec 2023 09:21:24 +0100 Subject: [PATCH 033/342] [clang][dataflow] Disallow setting properties on `RecordValue`s. (#76042) Instead, synthetic fields should now be used for the same purpose. These have a number of advantages, as described in https://github.com/llvm/llvm-project/pull/73860, and longer-term, we want to eliminate `RecordValue` entirely. As `RecordValue`s cannot have properties any more, I have replaced the `OptionalIntAnalysis` with an equivalent analysis that tracks nullness of pointers (instead of whether an optional has a value). This serves the same purpose, namely to check whether the framework applies a custom `merge()` operation to widen properties. --- .../FlowSensitive/DataflowEnvironment.h | 11 -- .../clang/Analysis/FlowSensitive/RecordOps.h | 12 +- .../clang/Analysis/FlowSensitive/Value.h | 40 ++--- .../lib/Analysis/FlowSensitive/RecordOps.cpp | 33 +--- .../Analysis/FlowSensitive/RecordOpsTest.cpp | 37 ----- .../TypeErasedDataflowAnalysisTest.cpp | 150 ++++++++---------- 6 files changed, 88 insertions(+), 195 deletions(-) diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index 2a9f8dce74c0a..e8c27d6c12038 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -727,20 +727,9 @@ RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME, std::vector<FieldDecl *> getFieldsForInitListExpr(const RecordDecl *RD); /// Associates a new `RecordValue` with `Loc` and returns the new value. -/// It is not defined whether the field values remain the same or not. -/// -/// This function is primarily intended for use by checks that set custom -/// properties on `RecordValue`s to model the state of these values. Such checks -/// should avoid modifying the properties of an existing `RecordValue` because -/// these changes would be visible to other `Environment`s that share the same -/// `RecordValue`. Instead, call `refreshRecordValue()`, then set the properties -/// on the new `RecordValue` that it returns. Typical usage: -/// -/// refreshRecordValue(Loc, Env).setProperty("my_prop", MyPropValue); RecordValue &refreshRecordValue(RecordStorageLocation &Loc, Environment &Env); /// Associates a new `RecordValue` with `Expr` and returns the new value. -/// See also documentation for the overload above. 
RecordValue &refreshRecordValue(const Expr &Expr, Environment &Env); } // namespace dataflow diff --git a/clang/include/clang/Analysis/FlowSensitive/RecordOps.h b/clang/include/clang/Analysis/FlowSensitive/RecordOps.h index 7b87840d626b4..783e53e980aa2 100644 --- a/clang/include/clang/Analysis/FlowSensitive/RecordOps.h +++ b/clang/include/clang/Analysis/FlowSensitive/RecordOps.h @@ -22,19 +22,13 @@ namespace dataflow { /// Copies a record (struct, class, or union) from `Src` to `Dst`. /// /// This performs a deep copy, i.e. it copies every field (including synthetic -/// fields) and recurses on fields of record type. It also copies properties -/// from the `RecordValue` associated with `Src` to the `RecordValue` associated -/// with `Dst` (if these `RecordValue`s exist). +/// fields) and recurses on fields of record type. /// /// If there is a `RecordValue` associated with `Dst` in the environment, this /// function creates a new `RecordValue` and associates it with `Dst`; clients /// need to be aware of this and must not assume that the `RecordValue` /// associated with `Dst` remains the same after the call. /// -/// We create a new `RecordValue` rather than modifying properties on the old -/// `RecordValue` because the old `RecordValue` may be shared with other -/// `Environment`s, and we don't want changes to properties to be visible there. -/// /// Requirements: /// /// `Src` and `Dst` must have the same canonical unqualified type. @@ -49,9 +43,7 @@ void copyRecord(RecordStorageLocation &Src, RecordStorageLocation &Dst, /// /// This performs a deep comparison, i.e. it compares every field (including /// synthetic fields) and recurses on fields of record type. Fields of reference -/// type compare equal if they refer to the same storage location. If -/// `RecordValue`s are associated with `Loc1` and Loc2`, it also compares the -/// properties on those `RecordValue`s. +/// type compare equal if they refer to the same storage location. /// /// Note on how to interpret the result: /// - If this returns true, the records are guaranteed to be equal at runtime. diff --git a/clang/include/clang/Analysis/FlowSensitive/Value.h b/clang/include/clang/Analysis/FlowSensitive/Value.h index e6c68e5b4e93e..be1bf9324c87b 100644 --- a/clang/include/clang/Analysis/FlowSensitive/Value.h +++ b/clang/include/clang/Analysis/FlowSensitive/Value.h @@ -63,7 +63,11 @@ class Value { /// Assigns `Val` as the value of the synthetic property with the given /// `Name`. + /// + /// Properties may not be set on `RecordValue`s; use synthetic fields instead + /// (for details, see documentation for `RecordStorageLocation`). void setProperty(llvm::StringRef Name, Value &Val) { + assert(getKind() != Kind::Record); Properties.insert_or_assign(Name, &Val); } @@ -184,33 +188,23 @@ class PointerValue final : public Value { /// In C++, prvalues of class type serve only a limited purpose: They can only /// be used to initialize a result object. It is not possible to access member /// variables or call member functions on a prvalue of class type. -/// Correspondingly, `RecordValue` also serves only two limited purposes: -/// - It conveys a prvalue of class type from the place where the object is -/// constructed to the result object that it initializes. +/// Correspondingly, `RecordValue` also serves only a limited purpose: It +/// conveys a prvalue of class type from the place where the object is +/// constructed to the result object that it initializes. 
/// -/// When creating a prvalue of class type, we already need a storage location -/// for `this`, even though prvalues are otherwise not associated with storage -/// locations. `RecordValue` is therefore essentially a wrapper for a storage -/// location, which is then used to set the storage location for the result -/// object when we process the AST node for that result object. +/// When creating a prvalue of class type, we already need a storage location +/// for `this`, even though prvalues are otherwise not associated with storage +/// locations. `RecordValue` is therefore essentially a wrapper for a storage +/// location, which is then used to set the storage location for the result +/// object when we process the AST node for that result object. /// -/// For example: -/// MyStruct S = MyStruct(3); +/// For example: +/// MyStruct S = MyStruct(3); /// -/// In this example, `MyStruct(3) is a prvalue, which is modeled as a -/// `RecordValue` that wraps a `RecordStorageLocation`. This -// `RecordStorageLocation` is then used as the storage location for `S`. +/// In this example, `MyStruct(3) is a prvalue, which is modeled as a +/// `RecordValue` that wraps a `RecordStorageLocation`. This +/// `RecordStorageLocation` is then used as the storage location for `S`. /// -/// - It allows properties to be associated with an object of class type. -/// Note that when doing so, you should avoid mutating the properties of an -/// existing `RecordValue` in place, as these changes would be visible to -/// other `Environment`s that share the same `RecordValue`. Instead, associate -/// a new `RecordValue` with the `RecordStorageLocation` and set the -/// properties on this new `RecordValue`. (See also `refreshRecordValue()` in -/// DataflowEnvironment.h, which makes this easy.) -/// Note also that this implies that it is common for the same -/// `RecordStorageLocation` to be associated with different `RecordValue`s -/// in different environments. /// Over time, we may eliminate `RecordValue` entirely. 
See also the discussion /// here: https://reviews.llvm.org/D155204#inline-1503204 class RecordValue final : public Value { diff --git a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp index a326826db2394..da4dd6dc07851 100644 --- a/clang/lib/Analysis/FlowSensitive/RecordOps.cpp +++ b/clang/lib/Analysis/FlowSensitive/RecordOps.cpp @@ -66,19 +66,8 @@ void clang::dataflow::copyRecord(RecordStorageLocation &Src, } } - RecordValue *SrcVal = Env.get<RecordValue>(Src); - RecordValue *DstVal = Env.get<RecordValue>(Dst); - - DstVal = &Env.create<RecordValue>(Dst); + RecordValue *DstVal = &Env.create<RecordValue>(Dst); Env.setValue(Dst, *DstVal); - - if (SrcVal == nullptr) - return; - - for (const auto &[Name, Value] : SrcVal->properties()) { - if (Value != nullptr) - DstVal->setProperty(Name, *Value); - } } bool clang::dataflow::recordsEqual(const RecordStorageLocation &Loc1, @@ -125,25 +114,5 @@ bool clang::dataflow::recordsEqual(const RecordStorageLocation &Loc1, } } - llvm::StringMap<Value *> Props1, Props2; - - if (RecordValue *Val1 = Env1.get<RecordValue>(Loc1)) - for (const auto &[Name, Value] : Val1->properties()) - Props1[Name] = Value; - if (RecordValue *Val2 = Env2.get<RecordValue>(Loc2)) - for (const auto &[Name, Value] : Val2->properties()) - Props2[Name] = Value; - - if (Props1.size() != Props2.size()) - return false; - - for (const auto &[Name, Value] : Props1) { - auto It = Props2.find(Name); - if (It == Props2.end()) - return false; - if (Value != It->second) - return false; - } - return true; } diff --git a/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp b/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp index 84fe675c32c2d..cd6a37d370e85 100644 --- a/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/RecordOpsTest.cpp @@ -89,8 +89,6 @@ TEST(RecordOpsTest, CopyRecord) { auto *S2Val = cast<RecordValue>(Env.getValue(S2)); EXPECT_NE(S1Val, S2Val); - S1Val->setProperty("prop", Env.getBoolLiteralValue(true)); - copyRecord(S1, S2, Env); EXPECT_EQ(getFieldValue(&S1, *OuterIntDecl, Env), @@ -104,8 +102,6 @@ TEST(RecordOpsTest, CopyRecord) { S1Val = cast<RecordValue>(Env.getValue(S1)); S2Val = cast<RecordValue>(Env.getValue(S2)); EXPECT_NE(S1Val, S2Val); - - EXPECT_EQ(S2Val->getProperty("prop"), &Env.getBoolLiteralValue(true)); }); } @@ -150,9 +146,6 @@ TEST(RecordOpsTest, RecordsEqual) { Env.setValue(S1.getSyntheticField("synth_int"), Env.create<IntegerValue>()); - cast<RecordValue>(Env.getValue(S1)) - ->setProperty("prop", Env.getBoolLiteralValue(true)); - // Strategy: Create two equal records, then verify each of the various // ways in which records can differ causes recordsEqual to return false. // changes we can make to the record. @@ -202,36 +195,6 @@ TEST(RecordOpsTest, RecordsEqual) { EXPECT_FALSE(recordsEqual(S1, S2, Env)); copyRecord(S1, S2, Env); EXPECT_TRUE(recordsEqual(S1, S2, Env)); - - // S1 and S2 have the same property with different values. - cast<RecordValue>(Env.getValue(S2)) - ->setProperty("prop", Env.getBoolLiteralValue(false)); - EXPECT_FALSE(recordsEqual(S1, S2, Env)); - copyRecord(S1, S2, Env); - EXPECT_TRUE(recordsEqual(S1, S2, Env)); - - // S1 has a property that S2 doesn't have. - cast<RecordValue>(Env.getValue(S1)) - ->setProperty("other_prop", Env.getBoolLiteralValue(false)); - EXPECT_FALSE(recordsEqual(S1, S2, Env)); - // We modified S1 this time, so need to copy back the other way. 
- copyRecord(S2, S1, Env); - EXPECT_TRUE(recordsEqual(S1, S2, Env)); - - // S2 has a property that S1 doesn't have. - cast<RecordValue>(Env.getValue(S2)) - ->setProperty("other_prop", Env.getBoolLiteralValue(false)); - EXPECT_FALSE(recordsEqual(S1, S2, Env)); - copyRecord(S1, S2, Env); - EXPECT_TRUE(recordsEqual(S1, S2, Env)); - - // S1 and S2 have the same number of properties, but with different - // names. - cast<RecordValue>(Env.getValue(S1)) - ->setProperty("prop1", Env.getBoolLiteralValue(false)); - cast<RecordValue>(Env.getValue(S2)) - ->setProperty("prop2", Env.getBoolLiteralValue(false)); - EXPECT_FALSE(recordsEqual(S1, S2, Env)); }); } diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp index 4c3cb322eacfb..8d481788af208 100644 --- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp @@ -623,11 +623,11 @@ TEST_F(JoinFlowConditionsTest, JoinDistinctButProvablyEquivalentValues) { }); } -class OptionalIntAnalysis final - : public DataflowAnalysis<OptionalIntAnalysis, NoopLattice> { +class NullPointerAnalysis final + : public DataflowAnalysis<NullPointerAnalysis, NoopLattice> { public: - explicit OptionalIntAnalysis(ASTContext &Context) - : DataflowAnalysis<OptionalIntAnalysis, NoopLattice>(Context) {} + explicit NullPointerAnalysis(ASTContext &Context) + : DataflowAnalysis<NullPointerAnalysis, NoopLattice>(Context) {} static NoopLattice initialElement() { return {}; } @@ -636,40 +636,37 @@ class OptionalIntAnalysis final if (!CS) return; const Stmt *S = CS->getStmt(); - auto OptionalIntRecordDecl = recordDecl(hasName("OptionalInt")); - auto HasOptionalIntType = hasType(OptionalIntRecordDecl); - - SmallVector<BoundNodes, 1> Matches = match( - stmt(anyOf(cxxConstructExpr(HasOptionalIntType).bind("construct"), - cxxOperatorCallExpr( - callee(cxxMethodDecl(ofClass(OptionalIntRecordDecl)))) - .bind("operator"))), - *S, getASTContext()); - if (const auto *E = selectFirst<CXXConstructExpr>( - "construct", Matches)) { - cast<RecordValue>(Env.getValue(*E)) - ->setProperty("has_value", Env.getBoolLiteralValue(false)); - } else if (const auto *E = - selectFirst<CXXOperatorCallExpr>("operator", Matches)) { - assert(E->getNumArgs() > 0); - auto *Object = E->getArg(0); - assert(Object != nullptr); - - refreshRecordValue(*Object, Env) - .setProperty("has_value", Env.getBoolLiteralValue(true)); + const Expr *E = dyn_cast<Expr>(S); + if (!E) + return; + + if (!E->getType()->isPointerType()) + return; + + // Make sure we have a `PointerValue` for `E`. + auto *PtrVal = cast_or_null<PointerValue>(Env.getValue(*E)); + if (PtrVal == nullptr) { + PtrVal = cast<PointerValue>(Env.createValue(E->getType())); + Env.setValue(*E, *PtrVal); } + + if (auto *Cast = dyn_cast<ImplicitCastExpr>(E); + Cast && Cast->getCastKind() == CK_NullToPointer) + PtrVal->setProperty("is_null", Env.getBoolLiteralValue(true)); + else if (auto *Op = dyn_cast<UnaryOperator>(E); + Op && Op->getOpcode() == UO_AddrOf) + PtrVal->setProperty("is_null", Env.getBoolLiteralValue(false)); } ComparisonResult compare(QualType Type, const Value &Val1, const Environment &Env1, const Value &Val2, const Environment &Env2) override { - // Nothing to say about a value that does not model an `OptionalInt`. 
- if (!Type->isRecordType() || - Type->getAsCXXRecordDecl()->getQualifiedNameAsString() != "OptionalInt") + // Nothing to say about a value that is not a pointer. + if (!Type->isPointerType()) return ComparisonResult::Unknown; - auto *Prop1 = Val1.getProperty("has_value"); - auto *Prop2 = Val2.getProperty("has_value"); + auto *Prop1 = Val1.getProperty("is_null"); + auto *Prop2 = Val2.getProperty("is_null"); assert(Prop1 != nullptr && Prop2 != nullptr); return areEquivalentValues(*Prop1, *Prop2) ? ComparisonResult::Same : ComparisonResult::Different; @@ -678,23 +675,22 @@ class OptionalIntAnalysis final bool merge(QualType Type, const Value &Val1, const Environment &Env1, const Value &Val2, const Environment &Env2, Value &MergedVal, Environment &MergedEnv) override { - // Nothing to say about a value that does not model an `OptionalInt`. - if (!Type->isRecordType() || - Type->getAsCXXRecordDecl()->getQualifiedNameAsString() != "OptionalInt") + // Nothing to say about a value that is not a pointer. + if (!Type->isPointerType()) return false; - auto *HasValue1 = cast_or_null<BoolValue>(Val1.getProperty("has_value")); - if (HasValue1 == nullptr) + auto *IsNull1 = cast_or_null<BoolValue>(Val1.getProperty("is_null")); + if (IsNull1 == nullptr) return false; - auto *HasValue2 = cast_or_null<BoolValue>(Val2.getProperty("has_value")); - if (HasValue2 == nullptr) + auto *IsNull2 = cast_or_null<BoolValue>(Val2.getProperty("is_null")); + if (IsNull2 == nullptr) return false; - if (HasValue1 == HasValue2) - MergedVal.setProperty("has_value", *HasValue1); + if (IsNull1 == IsNull2) + MergedVal.setProperty("is_null", *IsNull1); else - MergedVal.setProperty("has_value", MergedEnv.makeTopBoolValue()); + MergedVal.setProperty("is_null", MergedEnv.makeTopBoolValue()); return true; } }; @@ -703,23 +699,14 @@ class WideningTest : public Test { protected: template <typename Matcher> void runDataflow(llvm::StringRef Code, Matcher Match) { - tooling::FileContentMappings FilesContents; - FilesContents.push_back( - std::make_pair<std::string, std::string>("widening_test_defs.h", R"( - struct OptionalInt { - OptionalInt() = default; - OptionalInt& operator=(int); - }; - )")); ASSERT_THAT_ERROR( - checkDataflow<OptionalIntAnalysis>( - AnalysisInputs<OptionalIntAnalysis>( + checkDataflow<NullPointerAnalysis>( + AnalysisInputs<NullPointerAnalysis>( Code, ast_matchers::hasName("target"), [](ASTContext &Context, Environment &Env) { - return OptionalIntAnalysis(Context); + return NullPointerAnalysis(Context); }) - .withASTBuildArgs({"-fsyntax-only", "-std=c++17"}) - .withASTBuildVirtualMappedFiles(std::move(FilesContents)), + .withASTBuildArgs({"-fsyntax-only", "-std=c++17"}), /*VerifyResults=*/[&Match](const llvm::StringMap< DataflowAnalysisState<NoopLattice>> &Results, @@ -731,13 +718,12 @@ class WideningTest : public Test { TEST_F(WideningTest, JoinDistinctValuesWithDistinctProperties) { std::string Code = R"( - #include "widening_test_defs.h" - void target(bool Cond) { - OptionalInt Foo; + int *Foo = nullptr; + int i = 0; /*[[p1]]*/ if (Cond) { - Foo = 1; + Foo = &i; /*[[p2]]*/ } (void)0; @@ -760,27 +746,27 @@ TEST_F(WideningTest, JoinDistinctValuesWithDistinctProperties) { return Env.getValue(*FooDecl); }; - EXPECT_EQ(GetFooValue(Env1)->getProperty("has_value"), - &Env1.getBoolLiteralValue(false)); - EXPECT_EQ(GetFooValue(Env2)->getProperty("has_value"), - &Env2.getBoolLiteralValue(true)); + EXPECT_EQ(GetFooValue(Env1)->getProperty("is_null"), + &Env1.getBoolLiteralValue(true)); + 
EXPECT_EQ(GetFooValue(Env2)->getProperty("is_null"), + &Env2.getBoolLiteralValue(false)); EXPECT_TRUE( - isa<TopBoolValue>(GetFooValue(Env3)->getProperty("has_value"))); + isa<TopBoolValue>(GetFooValue(Env3)->getProperty("is_null"))); }); } TEST_F(WideningTest, JoinDistinctValuesWithSameProperties) { std::string Code = R"( - #include "widening_test_defs.h" - void target(bool Cond) { - OptionalInt Foo; + int *Foo = nullptr; + int i1 = 0; + int i2 = 0; /*[[p1]]*/ if (Cond) { - Foo = 1; + Foo = &i1; /*[[p2]]*/ } else { - Foo = 2; + Foo = &i2; /*[[p3]]*/ } (void)0; @@ -805,14 +791,14 @@ TEST_F(WideningTest, JoinDistinctValuesWithSameProperties) { return Env.getValue(*FooDecl); }; - EXPECT_EQ(GetFooValue(Env1)->getProperty("has_value"), - &Env1.getBoolLiteralValue(false)); - EXPECT_EQ(GetFooValue(Env2)->getProperty("has_value"), - &Env2.getBoolLiteralValue(true)); - EXPECT_EQ(GetFooValue(Env3)->getProperty("has_value"), - &Env3.getBoolLiteralValue(true)); - EXPECT_EQ(GetFooValue(Env4)->getProperty("has_value"), - &Env4.getBoolLiteralValue(true)); + EXPECT_EQ(GetFooValue(Env1)->getProperty("is_null"), + &Env1.getBoolLiteralValue(true)); + EXPECT_EQ(GetFooValue(Env2)->getProperty("is_null"), + &Env2.getBoolLiteralValue(false)); + EXPECT_EQ(GetFooValue(Env3)->getProperty("is_null"), + &Env3.getBoolLiteralValue(false)); + EXPECT_EQ(GetFooValue(Env4)->getProperty("is_null"), + &Env4.getBoolLiteralValue(false)); }); } @@ -849,13 +835,13 @@ TEST_F(WideningTest, DistinctPointersToTheSameLocationAreEquivalent) { TEST_F(WideningTest, DistinctValuesWithSamePropertiesAreEquivalent) { std::string Code = R"( - #include "widening_test_defs.h" - void target(bool Cond) { - OptionalInt Foo; - Foo = 1; + int *Foo; + int i1 = 0; + int i2 = 0; + Foo = &i1; while (Cond) { - Foo = 2; + Foo = &i2; } (void)0; /*[[p]]*/ @@ -872,8 +858,8 @@ TEST_F(WideningTest, DistinctValuesWithSamePropertiesAreEquivalent) { ASSERT_THAT(FooDecl, NotNull()); const auto *FooVal = Env.getValue(*FooDecl); - EXPECT_EQ(FooVal->getProperty("has_value"), - &Env.getBoolLiteralValue(true)); + EXPECT_EQ(FooVal->getProperty("is_null"), + &Env.getBoolLiteralValue(false)); }); } From 0df320093155f937ae985847f20e28b790667a9f Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Thu, 21 Dec 2023 09:22:21 +0100 Subject: [PATCH 034/342] [ValueTracking] Fix KnownBits conflict for poison-only vector If all the demanded elements are poison, return unknown instead of conflict to avoid downstream assertions. Fixes https://github.com/llvm/llvm-project/issues/75505. 
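A minimal sketch, not part of the patch, of why the conflict can arise in the first place: for a constant vector, computeKnownBits starts from an all-bits-set Zero/One pair and intersects in the bits of every demanded element, so if every demanded element is poison nothing is intersected and Zero and One still overlap. The helper below is a simplified stand-in for that code path (the function name and surrounding loop are invented for illustration; only the llvm::KnownBits/APInt calls are real API):

  #include "llvm/ADT/APInt.h"
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/Support/KnownBits.h"

  // Simplified stand-in for the constant-vector case of computeKnownBits().
  // Elts holds the bit patterns of the vector elements; a cleared bit in
  // DemandedElts marks an element whose value does not matter (e.g. poison).
  static llvm::KnownBits knownBitsOfConstVec(llvm::ArrayRef<llvm::APInt> Elts,
                                             const llvm::APInt &DemandedElts) {
    llvm::KnownBits Known(Elts.front().getBitWidth());
    Known.Zero.setAllBits();
    Known.One.setAllBits();
    for (unsigned I = 0, E = Elts.size(); I != E; ++I) {
      if (!DemandedElts[I])
        continue;
      Known.Zero &= ~Elts[I];
      Known.One &= Elts[I];
    }
    // With no demanded elements, Zero and One still overlap everywhere.
    // This mirrors the check added by the patch: report "unknown" rather
    // than a conflicting result that trips downstream assertions.
    if (Known.hasConflict())
      Known.resetAll();
    return Known;
  }
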
--- llvm/lib/Analysis/ValueTracking.cpp | 4 ++++ llvm/test/Analysis/ValueTracking/pr75505.ll | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 llvm/test/Analysis/ValueTracking/pr75505.ll diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 9b8376833d2e6..efdb3fc285824 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -1826,6 +1826,8 @@ void computeKnownBits(const Value *V, const APInt &DemandedElts, Known.Zero &= ~Elt; Known.One &= Elt; } + if (Known.hasConflict()) + Known.resetAll(); return; } @@ -1849,6 +1851,8 @@ void computeKnownBits(const Value *V, const APInt &DemandedElts, Known.Zero &= ~Elt; Known.One &= Elt; } + if (Known.hasConflict()) + Known.resetAll(); return; } diff --git a/llvm/test/Analysis/ValueTracking/pr75505.ll b/llvm/test/Analysis/ValueTracking/pr75505.ll new file mode 100644 index 0000000000000..79368ff18e060 --- /dev/null +++ b/llvm/test/Analysis/ValueTracking/pr75505.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=instsimplify < %s | FileCheck %s + +; Just make sure that we don't assert. +define i32 @test(<2 x i16> %a, i32 %b) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: <2 x i16> [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: [[MUL:%.*]] = mul <2 x i16> [[A]], <i16 -1, i16 poison> +; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i16> [[MUL]] to i32 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 [[B]], [[BC]] +; CHECK-NEXT: ret i32 [[LSHR]] +; + %mul = mul <2 x i16> %a, <i16 -1, i16 poison> + %bc = bitcast <2 x i16> %mul to i32 + %lshr = lshr i32 %b, %bc + ret i32 %lshr +} From 2203a4e6e01ce6bfd69505420d304a81daf23dc9 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu <yedeng.yd@linux.alibaba.com> Date: Thu, 21 Dec 2023 16:35:20 +0800 Subject: [PATCH 035/342] [NFC] [Serialization] Improve AST serialization by reordering packed bits and extract big bits from packed bits Previously I tried to improve the size of .pcm files by introducing packed bits. And I find we can improve it further by reordering the bits. The secret comes from the VBR format. We can find the formal definition of VBR format in the doc of LLVM. The VBR format will be pretty efficicent for small numbers. For example, if we need to pack 8 bits into a value and the stored value is 0xf0, the actual stored value will be 0b000111'110000, which takes 12 bits actually. However, if we changed the order to be 0x0f, then we can store it as 0b001111, which takes 6 bits only now. So we can improve the size by placing bits with lower probability to be 1 in the higher bits and extract bit bigs from the packed bits to make it possible to be optimized by VBR. After this patch, the size of std module becomes to 27.7MB from 28.1MB. 
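The size argument above can be checked with a small sketch, not part of the patch, assuming the bitstream's VBR6 encoding (each 6-bit chunk carries 5 payload bits plus a continuation bit); the helper name vbr6Bits is made up for illustration:

  #include <cstdint>
  #include <cstdio>

  // Bits occupied by a value emitted as VBR6: 6-bit chunks, 5 payload bits
  // per chunk, the high bit of each chunk says whether another chunk follows.
  static unsigned vbr6Bits(uint64_t V) {
    unsigned Chunks = 1;
    while (V >>= 5)
      ++Chunks;
    return Chunks * 6;
  }

  int main() {
    // The same eight flag bits, packed in two different orders.
    std::printf("0xf0 -> %u bits\n", vbr6Bits(0xf0)); // 12 bits
    std::printf("0x0f -> %u bits\n", vbr6Bits(0x0f)); // 6 bits
  }

Placing the bits that are usually 0 in the high positions makes the second, cheaper case the common one, which is where the reported 28.1MB to 27.7MB improvement comes from.
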
--- clang/lib/Serialization/ASTReaderDecl.cpp | 50 +++-- clang/lib/Serialization/ASTReaderStmt.cpp | 138 ++++++------ clang/lib/Serialization/ASTWriterDecl.cpp | 261 ++++++++++++---------- clang/lib/Serialization/ASTWriterStmt.cpp | 67 +++--- 4 files changed, 280 insertions(+), 236 deletions(-) diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 209fb04342088..d989707d55752 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -584,7 +584,18 @@ void ASTDeclReader::Visit(Decl *D) { void ASTDeclReader::VisitDecl(Decl *D) { BitsUnpacker DeclBits(Record.readInt()); + auto ModuleOwnership = + (Decl::ModuleOwnershipKind)DeclBits.getNextBits(/*Width=*/3); + D->setReferenced(DeclBits.getNextBit()); + D->Used = DeclBits.getNextBit(); + IsDeclMarkedUsed |= D->Used; + D->setAccess((AccessSpecifier)DeclBits.getNextBits(/*Width=*/2)); + D->setImplicit(DeclBits.getNextBit()); bool HasStandaloneLexicalDC = DeclBits.getNextBit(); + bool HasAttrs = DeclBits.getNextBit(); + D->setTopLevelDeclInObjCContainer(DeclBits.getNextBit()); + D->InvalidDecl = DeclBits.getNextBit(); + D->FromASTFile = true; if (D->isTemplateParameter() || D->isTemplateParameterPack() || isa<ParmVarDecl, ObjCTypeParamDecl>(D)) { @@ -623,20 +634,6 @@ void ASTDeclReader::VisitDecl(Decl *D) { } D->setLocation(ThisDeclLoc); - D->InvalidDecl = DeclBits.getNextBit(); - bool HasAttrs = DeclBits.getNextBit(); - D->setImplicit(DeclBits.getNextBit()); - D->Used = DeclBits.getNextBit(); - IsDeclMarkedUsed |= D->Used; - D->setReferenced(DeclBits.getNextBit()); - D->setTopLevelDeclInObjCContainer(DeclBits.getNextBit()); - D->setAccess((AccessSpecifier)DeclBits.getNextBits(/*Width=*/2)); - D->FromASTFile = true; - auto ModuleOwnership = - (Decl::ModuleOwnershipKind)DeclBits.getNextBits(/*Width=*/3); - bool ModulePrivate = - (ModuleOwnership == Decl::ModuleOwnershipKind::ModulePrivate); - if (HasAttrs) { AttrVec Attrs; Record.readAttributes(Attrs); @@ -647,8 +644,9 @@ void ASTDeclReader::VisitDecl(Decl *D) { // Determine whether this declaration is part of a (sub)module. If so, it // may not yet be visible. + bool ModulePrivate = + (ModuleOwnership == Decl::ModuleOwnershipKind::ModulePrivate); if (unsigned SubmoduleID = readSubmoduleID()) { - switch (ModuleOwnership) { case Decl::ModuleOwnershipKind::Visible: ModuleOwnership = Decl::ModuleOwnershipKind::VisibleWhenImported; @@ -1065,9 +1063,11 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) { // after everything else is read. 
BitsUnpacker FunctionDeclBits(Record.readInt()); + FD->setCachedLinkage((Linkage)FunctionDeclBits.getNextBits(/*Width=*/3)); FD->setStorageClass((StorageClass)FunctionDeclBits.getNextBits(/*Width=*/3)); FD->setInlineSpecified(FunctionDeclBits.getNextBit()); FD->setImplicitlyInline(FunctionDeclBits.getNextBit()); + FD->setHasSkippedBody(FunctionDeclBits.getNextBit()); FD->setVirtualAsWritten(FunctionDeclBits.getNextBit()); // We defer calling `FunctionDecl::setPure()` here as for methods of // `CXXTemplateSpecializationDecl`s, we may not have connected up the @@ -1081,16 +1081,14 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) { FD->setDefaulted(FunctionDeclBits.getNextBit()); FD->setExplicitlyDefaulted(FunctionDeclBits.getNextBit()); FD->setIneligibleOrNotSelected(FunctionDeclBits.getNextBit()); - FD->setHasImplicitReturnZero(FunctionDeclBits.getNextBit()); FD->setConstexprKind( (ConstexprSpecKind)FunctionDeclBits.getNextBits(/*Width=*/2)); - FD->setUsesSEHTry(FunctionDeclBits.getNextBit()); - FD->setHasSkippedBody(FunctionDeclBits.getNextBit()); + FD->setHasImplicitReturnZero(FunctionDeclBits.getNextBit()); FD->setIsMultiVersion(FunctionDeclBits.getNextBit()); FD->setLateTemplateParsed(FunctionDeclBits.getNextBit()); FD->setFriendConstraintRefersToEnclosingTemplate( FunctionDeclBits.getNextBit()); - FD->setCachedLinkage((Linkage)FunctionDeclBits.getNextBits(/*Width=*/3)); + FD->setUsesSEHTry(FunctionDeclBits.getNextBit()); FD->EndRangeLoc = readSourceLocation(); if (FD->isExplicitlyDefaulted()) @@ -1597,6 +1595,8 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) { VisitDeclaratorDecl(VD); BitsUnpacker VarDeclBits(Record.readInt()); + auto VarLinkage = Linkage(VarDeclBits.getNextBits(/*Width=*/3)); + bool DefGeneratedInModule = VarDeclBits.getNextBit(); VD->VarDeclBits.SClass = (StorageClass)VarDeclBits.getNextBits(/*Width=*/3); VD->VarDeclBits.TSCSpec = VarDeclBits.getNextBits(/*Width=*/2); VD->VarDeclBits.InitStyle = VarDeclBits.getNextBits(/*Width=*/2); @@ -1608,17 +1608,20 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) { VD->NonParmVarDeclBits.ExceptionVar = VarDeclBits.getNextBit(); VD->NonParmVarDeclBits.NRVOVariable = VarDeclBits.getNextBit(); VD->NonParmVarDeclBits.CXXForRangeDecl = VarDeclBits.getNextBit(); - VD->NonParmVarDeclBits.ObjCForDecl = VarDeclBits.getNextBit(); + VD->NonParmVarDeclBits.IsInline = VarDeclBits.getNextBit(); VD->NonParmVarDeclBits.IsInlineSpecified = VarDeclBits.getNextBit(); VD->NonParmVarDeclBits.IsConstexpr = VarDeclBits.getNextBit(); VD->NonParmVarDeclBits.IsInitCapture = VarDeclBits.getNextBit(); VD->NonParmVarDeclBits.PreviousDeclInSameBlockScope = VarDeclBits.getNextBit(); - VD->NonParmVarDeclBits.ImplicitParamKind = - VarDeclBits.getNextBits(/*Width*/ 3); + VD->NonParmVarDeclBits.EscapingByref = VarDeclBits.getNextBit(); HasDeducedType = VarDeclBits.getNextBit(); + VD->NonParmVarDeclBits.ImplicitParamKind = + VarDeclBits.getNextBits(/*Width*/ 3); + + VD->NonParmVarDeclBits.ObjCForDecl = VarDeclBits.getNextBit(); } // If this variable has a deduced type, defer reading that type until we are @@ -1630,7 +1633,6 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) { VD->setType(Reader.GetType(DeferredTypeID)); DeferredTypeID = 0; - auto VarLinkage = Linkage(VarDeclBits.getNextBits(/*Width=*/3)); VD->setCachedLinkage(VarLinkage); // Reconstruct the one piece of the IdentifierNamespace that we need. 
@@ -1638,7 +1640,7 @@ ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) { VD->getLexicalDeclContext()->isFunctionOrMethod()) VD->setLocalExternDecl(); - if (VarDeclBits.getNextBit()) { + if (DefGeneratedInModule) { Reader.DefinitionSource[VD] = Loc.F->Kind == ModuleKind::MK_MainFile || Reader.getContext().getLangOpts().BuildingPCHWithObjectFile; diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index d2424bffc2288..cf37ffe4c38b5 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -152,10 +152,9 @@ void ASTStmtReader::VisitNullStmt(NullStmt *S) { void ASTStmtReader::VisitCompoundStmt(CompoundStmt *S) { VisitStmt(S); - CurrentUnpackingBits.emplace(Record.readInt()); SmallVector<Stmt *, 16> Stmts; - unsigned NumStmts = CurrentUnpackingBits->getNextBits(/*Width=*/20); - unsigned HasFPFeatures = CurrentUnpackingBits->getNextBit(); + unsigned NumStmts = Record.readInt(); + unsigned HasFPFeatures = Record.readInt(); assert(S->hasStoredFPFeatures() == HasFPFeatures); while (NumStmts--) Stmts.push_back(Record.readSubStmt()); @@ -600,20 +599,21 @@ void ASTStmtReader::VisitPredefinedExpr(PredefinedExpr *E) { void ASTStmtReader::VisitDeclRefExpr(DeclRefExpr *E) { VisitExpr(E); - E->DeclRefExprBits.HasQualifier = CurrentUnpackingBits->getNextBit(); - E->DeclRefExprBits.HasFoundDecl = CurrentUnpackingBits->getNextBit(); - E->DeclRefExprBits.HasTemplateKWAndArgsInfo = - CurrentUnpackingBits->getNextBit(); + CurrentUnpackingBits.emplace(Record.readInt()); E->DeclRefExprBits.HadMultipleCandidates = CurrentUnpackingBits->getNextBit(); E->DeclRefExprBits.RefersToEnclosingVariableOrCapture = CurrentUnpackingBits->getNextBit(); E->DeclRefExprBits.NonOdrUseReason = CurrentUnpackingBits->getNextBits(/*Width=*/2); E->DeclRefExprBits.IsImmediateEscalating = CurrentUnpackingBits->getNextBit(); + E->DeclRefExprBits.HasFoundDecl = CurrentUnpackingBits->getNextBit(); + E->DeclRefExprBits.HasQualifier = CurrentUnpackingBits->getNextBit(); + E->DeclRefExprBits.HasTemplateKWAndArgsInfo = + CurrentUnpackingBits->getNextBit(); E->DeclRefExprBits.CapturedByCopyInLambdaWithExplicitObjectParameter = false; unsigned NumTemplateArgs = 0; if (E->hasTemplateKWAndArgsInfo()) - NumTemplateArgs = CurrentUnpackingBits->getNextBits(/*Width=*/12); + NumTemplateArgs = Record.readInt(); if (E->hasQualifier()) new (E->getTrailingObjects<NestedNameSpecifierLoc>()) @@ -1013,10 +1013,11 @@ void ASTStmtReader::VisitOMPIteratorExpr(OMPIteratorExpr *E) { void ASTStmtReader::VisitCallExpr(CallExpr *E) { VisitExpr(E); - unsigned NumArgs = CurrentUnpackingBits->getNextBits(/*Width=*/13); - bool HasFPFeatures = CurrentUnpackingBits->getNextBit(); + unsigned NumArgs = Record.readInt(); + CurrentUnpackingBits.emplace(Record.readInt()); E->setADLCallKind( static_cast<CallExpr::ADLCallKind>(CurrentUnpackingBits->getNextBit())); + bool HasFPFeatures = CurrentUnpackingBits->getNextBit(); assert((NumArgs == E->getNumArgs()) && "Wrong NumArgs!"); E->setRParenLoc(readSourceLocation()); E->setCallee(Record.readSubExpr()); @@ -1035,10 +1036,11 @@ void ASTStmtReader::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) { void ASTStmtReader::VisitMemberExpr(MemberExpr *E) { VisitExpr(E); + CurrentUnpackingBits.emplace(Record.readInt()); bool HasQualifier = CurrentUnpackingBits->getNextBit(); bool HasFoundDecl = CurrentUnpackingBits->getNextBit(); bool HasTemplateInfo = CurrentUnpackingBits->getNextBit(); - unsigned NumTemplateArgs = 
CurrentUnpackingBits->getNextBits(/*Width=*/12); + unsigned NumTemplateArgs = Record.readInt(); E->Base = Record.readSubExpr(); E->MemberDecl = Record.readDeclAs<ValueDecl>(); @@ -1103,10 +1105,14 @@ void ASTStmtReader::VisitCastExpr(CastExpr *E) { VisitExpr(E); unsigned NumBaseSpecs = Record.readInt(); assert(NumBaseSpecs == E->path_size()); + + CurrentUnpackingBits.emplace(Record.readInt()); + E->setCastKind((CastKind)CurrentUnpackingBits->getNextBits(/*Width=*/7)); unsigned HasFPFeatures = CurrentUnpackingBits->getNextBit(); assert(E->hasStoredFPFeatures() == HasFPFeatures); + E->setSubExpr(Record.readSubExpr()); - E->setCastKind((CastKind)CurrentUnpackingBits->getNextBits(/*Width=*/7)); + CastExpr::path_iterator BaseI = E->path_begin(); while (NumBaseSpecs--) { auto *BaseSpec = new (Record.getContext()) CXXBaseSpecifier; @@ -1119,12 +1125,12 @@ void ASTStmtReader::VisitCastExpr(CastExpr *E) { } void ASTStmtReader::VisitBinaryOperator(BinaryOperator *E) { - VisitExpr(E); - bool hasFP_Features = CurrentUnpackingBits->getNextBit(); - E->setHasStoredFPFeatures(hasFP_Features); + CurrentUnpackingBits.emplace(Record.readInt()); E->setOpcode( (BinaryOperator::Opcode)CurrentUnpackingBits->getNextBits(/*Width=*/6)); + bool hasFP_Features = CurrentUnpackingBits->getNextBit(); + E->setHasStoredFPFeatures(hasFP_Features); E->setLHS(Record.readSubExpr()); E->setRHS(Record.readSubExpr()); E->setOperatorLoc(readSourceLocation()); @@ -1700,8 +1706,7 @@ void ASTStmtReader::VisitMSDependentExistsStmt(MSDependentExistsStmt *S) { void ASTStmtReader::VisitCXXOperatorCallExpr(CXXOperatorCallExpr *E) { VisitCallExpr(E); - E->CXXOperatorCallExprBits.OperatorKind = - CurrentUnpackingBits->getNextBits(/*Width=*/6); + E->CXXOperatorCallExprBits.OperatorKind = Record.readInt(); E->Range = Record.readSourceRange(); } @@ -1976,8 +1981,9 @@ void ASTStmtReader::VisitCXXDependentScopeMemberExpr( CXXDependentScopeMemberExpr *E) { VisitExpr(E); + unsigned NumTemplateArgs = Record.readInt(); + CurrentUnpackingBits.emplace(Record.readInt()); bool HasTemplateKWAndArgsInfo = CurrentUnpackingBits->getNextBit(); - unsigned NumTemplateArgs = CurrentUnpackingBits->getNextBits(/*Width=*/16); bool HasFirstQualifierFoundInScope = CurrentUnpackingBits->getNextBit(); assert((HasTemplateKWAndArgsInfo == E->hasTemplateKWAndArgsInfo()) && @@ -2044,15 +2050,15 @@ ASTStmtReader::VisitCXXUnresolvedConstructExpr(CXXUnresolvedConstructExpr *E) { void ASTStmtReader::VisitOverloadExpr(OverloadExpr *E) { VisitExpr(E); + unsigned NumResults = Record.readInt(); CurrentUnpackingBits.emplace(Record.readInt()); - unsigned NumResults = CurrentUnpackingBits->getNextBits(/*Width=*/12); bool HasTemplateKWAndArgsInfo = CurrentUnpackingBits->getNextBit(); assert((E->getNumDecls() == NumResults) && "Wrong NumResults!"); assert((E->hasTemplateKWAndArgsInfo() == HasTemplateKWAndArgsInfo) && "Wrong HasTemplateKWAndArgsInfo!"); if (HasTemplateKWAndArgsInfo) { - unsigned NumTemplateArgs = CurrentUnpackingBits->getNextBits(/*Width=*/12); + unsigned NumTemplateArgs = Record.readInt(); ReadTemplateKWAndArgsInfo(*E->getTrailingASTTemplateKWAndArgsInfo(), E->getTrailingTemplateArgumentLoc(), NumTemplateArgs); @@ -2869,9 +2875,8 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case STMT_COMPOUND: { - BitsUnpacker StmtCompoundBits(Record[ASTStmtReader::NumStmtFields]); - unsigned NumStmts = StmtCompoundBits.getNextBits(/*Width=*/20); - bool HasFPFeatures = StmtCompoundBits.getNextBit(); + unsigned NumStmts = Record[ASTStmtReader::NumStmtFields]; + bool 
HasFPFeatures = Record[ASTStmtReader::NumStmtFields + 1]; S = CompoundStmt::CreateEmpty(Context, NumStmts, HasFPFeatures); break; } @@ -2981,14 +2986,13 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_DECL_REF: { - BitsUnpacker DeclRefExprBits(Record[ASTStmtReader::NumStmtFields]); - DeclRefExprBits.advance(ASTStmtReader::NumExprBits); - bool HasQualifier = DeclRefExprBits.getNextBit(); + BitsUnpacker DeclRefExprBits(Record[ASTStmtReader::NumExprFields]); + DeclRefExprBits.advance(5); bool HasFoundDecl = DeclRefExprBits.getNextBit(); + bool HasQualifier = DeclRefExprBits.getNextBit(); bool HasTemplateKWAndArgsInfo = DeclRefExprBits.getNextBit(); - DeclRefExprBits.advance(5); unsigned NumTemplateArgs = HasTemplateKWAndArgsInfo - ? DeclRefExprBits.getNextBits(/*Width=*/12) + ? Record[ASTStmtReader::NumExprFields + 1] : 0; S = DeclRefExpr::CreateEmpty(Context, HasQualifier, HasFoundDecl, HasTemplateKWAndArgsInfo, NumTemplateArgs); @@ -3074,9 +3078,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_CALL: { - BitsUnpacker CallExprBits(Record[ASTStmtReader::NumStmtFields]); - CallExprBits.advance(ASTStmtReader::NumExprBits); - auto NumArgs = CallExprBits.getNextBits(/*Width=*/13); + auto NumArgs = Record[ASTStmtReader::NumExprFields]; + BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields + 1]); + CallExprBits.advance(1); auto HasFPFeatures = CallExprBits.getNextBit(); S = CallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures, Empty); break; @@ -3088,28 +3092,27 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_MEMBER: { - BitsUnpacker ExprMemberBits(Record[ASTStmtReader::NumStmtFields]); - ExprMemberBits.advance(ASTStmtReader::NumExprBits); + BitsUnpacker ExprMemberBits(Record[ASTStmtReader::NumExprFields]); bool HasQualifier = ExprMemberBits.getNextBit(); bool HasFoundDecl = ExprMemberBits.getNextBit(); bool HasTemplateInfo = ExprMemberBits.getNextBit(); - unsigned NumTemplateArgs = ExprMemberBits.getNextBits(/*Width=*/12); + unsigned NumTemplateArgs = Record[ASTStmtReader::NumExprFields + 1]; S = MemberExpr::CreateEmpty(Context, HasQualifier, HasFoundDecl, HasTemplateInfo, NumTemplateArgs); break; } case EXPR_BINARY_OPERATOR: { - BitsUnpacker BinaryOperatorBits(Record[ASTStmtReader::NumStmtFields]); - BinaryOperatorBits.advance(ASTStmtReader::NumExprBits); + BitsUnpacker BinaryOperatorBits(Record[ASTStmtReader::NumExprFields]); + BinaryOperatorBits.advance(/*Size of opcode*/ 6); bool HasFPFeatures = BinaryOperatorBits.getNextBit(); S = BinaryOperator::CreateEmpty(Context, HasFPFeatures); break; } case EXPR_COMPOUND_ASSIGN_OPERATOR: { - BitsUnpacker BinaryOperatorBits(Record[ASTStmtReader::NumStmtFields]); - BinaryOperatorBits.advance(ASTStmtReader::NumExprBits); + BitsUnpacker BinaryOperatorBits(Record[ASTStmtReader::NumExprFields]); + BinaryOperatorBits.advance(/*Size of opcode*/ 6); bool HasFPFeatures = BinaryOperatorBits.getNextBit(); S = CompoundAssignOperator::CreateEmpty(Context, HasFPFeatures); break; @@ -3124,18 +3127,18 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_IMPLICIT_CAST: { - BitsUnpacker CastExprBits(Record[ASTStmtReader::NumStmtFields]); - CastExprBits.advance(ASTStmtReader::NumExprBits); unsigned PathSize = Record[ASTStmtReader::NumExprFields]; + BitsUnpacker CastExprBits(Record[ASTStmtReader::NumExprFields + 1]); + CastExprBits.advance(7); bool HasFPFeatures = CastExprBits.getNextBit(); S = ImplicitCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures); break; } 
case EXPR_CSTYLE_CAST: { - BitsUnpacker CastExprBits(Record[ASTStmtReader::NumStmtFields]); - CastExprBits.advance(ASTStmtReader::NumExprBits); unsigned PathSize = Record[ASTStmtReader::NumExprFields]; + BitsUnpacker CastExprBits(Record[ASTStmtReader::NumExprFields + 1]); + CastExprBits.advance(7); bool HasFPFeatures = CastExprBits.getNextBit(); S = CStyleCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures); break; @@ -3834,9 +3837,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { } case EXPR_CXX_OPERATOR_CALL: { - BitsUnpacker CallExprBits(Record[ASTStmtReader::NumStmtFields]); - CallExprBits.advance(ASTStmtReader::NumExprBits); - auto NumArgs = CallExprBits.getNextBits(/*Width=*/13); + auto NumArgs = Record[ASTStmtReader::NumExprFields]; + BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields + 1]); + CallExprBits.advance(1); auto HasFPFeatures = CallExprBits.getNextBit(); S = CXXOperatorCallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures, Empty); @@ -3844,9 +3847,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { } case EXPR_CXX_MEMBER_CALL: { - BitsUnpacker CallExprBits(Record[ASTStmtReader::NumStmtFields]); - CallExprBits.advance(ASTStmtReader::NumExprBits); - auto NumArgs = CallExprBits.getNextBits(/*Width=*/13); + auto NumArgs = Record[ASTStmtReader::NumExprFields]; + BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields + 1]); + CallExprBits.advance(1); auto HasFPFeatures = CallExprBits.getNextBit(); S = CXXMemberCallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures, Empty); @@ -3874,9 +3877,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_CXX_STATIC_CAST: { - BitsUnpacker CastExprBits(Record[ASTStmtReader::NumStmtFields]); - CastExprBits.advance(ASTStmtReader::NumExprBits); unsigned PathSize = Record[ASTStmtReader::NumExprFields]; + BitsUnpacker CastExprBits(Record[ASTStmtReader::NumExprFields + 1]); + CastExprBits.advance(7); bool HasFPFeatures = CastExprBits.getNextBit(); S = CXXStaticCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures); break; @@ -3903,9 +3906,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_CXX_FUNCTIONAL_CAST: { - BitsUnpacker CastExprBits(Record[ASTStmtReader::NumStmtFields]); - CastExprBits.advance(ASTStmtReader::NumExprBits); unsigned PathSize = Record[ASTStmtReader::NumExprFields]; + BitsUnpacker CastExprBits(Record[ASTStmtReader::NumExprFields + 1]); + CastExprBits.advance(7); bool HasFPFeatures = CastExprBits.getNextBit(); S = CXXFunctionalCastExpr::CreateEmpty(Context, PathSize, HasFPFeatures); break; @@ -3921,9 +3924,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { } case EXPR_USER_DEFINED_LITERAL: { - BitsUnpacker CallExprBits(Record[ASTStmtReader::NumStmtFields]); - CallExprBits.advance(ASTStmtReader::NumExprBits); - auto NumArgs = CallExprBits.getNextBits(/*Width=*/13); + auto NumArgs = Record[ASTStmtReader::NumExprFields]; + BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields + 1]); + CallExprBits.advance(1); auto HasFPFeatures = CallExprBits.getNextBit(); S = UserDefinedLiteral::CreateEmpty(Context, NumArgs, HasFPFeatures, Empty); @@ -4015,12 +4018,11 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_CXX_DEPENDENT_SCOPE_MEMBER: { + unsigned NumTemplateArgs = Record[ASTStmtReader::NumExprFields]; BitsUnpacker DependentScopeMemberBits( - Record[ASTStmtReader::NumStmtFields]); - DependentScopeMemberBits.advance(ASTStmtReader::NumExprBits); + Record[ASTStmtReader::NumExprFields + 1]); bool HasTemplateKWAndArgsInfo = 
DependentScopeMemberBits.getNextBit(); - unsigned NumTemplateArgs = - DependentScopeMemberBits.getNextBits(/*Width=*/16); + bool HasFirstQualifierFoundInScope = DependentScopeMemberBits.getNextBit(); S = CXXDependentScopeMemberExpr::CreateEmpty( @@ -4049,11 +4051,11 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_CXX_UNRESOLVED_MEMBER: { - BitsUnpacker OverloadExprBits(Record[ASTStmtReader::NumExprFields]); - auto NumResults = OverloadExprBits.getNextBits(/*Width=*/12); + auto NumResults = Record[ASTStmtReader::NumExprFields]; + BitsUnpacker OverloadExprBits(Record[ASTStmtReader::NumExprFields + 1]); auto HasTemplateKWAndArgsInfo = OverloadExprBits.getNextBit(); auto NumTemplateArgs = HasTemplateKWAndArgsInfo - ? OverloadExprBits.getNextBits(/*Width=*/12) + ? Record[ASTStmtReader::NumExprFields + 2] : 0; S = UnresolvedMemberExpr::CreateEmpty( Context, NumResults, HasTemplateKWAndArgsInfo, NumTemplateArgs); @@ -4061,11 +4063,11 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { } case EXPR_CXX_UNRESOLVED_LOOKUP: { - BitsUnpacker OverloadExprBits(Record[ASTStmtReader::NumExprFields]); - auto NumResults = OverloadExprBits.getNextBits(/*Width=*/12); + auto NumResults = Record[ASTStmtReader::NumExprFields]; + BitsUnpacker OverloadExprBits(Record[ASTStmtReader::NumExprFields + 1]); auto HasTemplateKWAndArgsInfo = OverloadExprBits.getNextBit(); auto NumTemplateArgs = HasTemplateKWAndArgsInfo - ? OverloadExprBits.getNextBits(/*Width=*/12) + ? Record[ASTStmtReader::NumExprFields + 2] : 0; S = UnresolvedLookupExpr::CreateEmpty( Context, NumResults, HasTemplateKWAndArgsInfo, NumTemplateArgs); @@ -4130,9 +4132,9 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; case EXPR_CUDA_KERNEL_CALL: { - BitsUnpacker CallExprBits(Record[ASTStmtReader::NumStmtFields]); - CallExprBits.advance(ASTStmtReader::NumExprBits); - auto NumArgs = CallExprBits.getNextBits(/*Width=*/13); + auto NumArgs = Record[ASTStmtReader::NumExprFields]; + BitsUnpacker CallExprBits(Record[ASTStmtReader::NumExprFields + 1]); + CallExprBits.advance(1); auto HasFPFeatures = CallExprBits.getNextBit(); S = CUDAKernelCallExpr::CreateEmpty(Context, NumArgs, HasFPFeatures, Empty); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 53128133588fa..2554abc682a1d 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -321,15 +321,25 @@ void ASTDeclWriter::Visit(Decl *D) { void ASTDeclWriter::VisitDecl(Decl *D) { BitsPacker DeclBits; + + // The order matters here. It will be better to put the bit with higher + // probability to be 0 in the end of the bits. + // + // Since we're using VBR6 format to store it. + // It will be pretty effient if all the higher bits are 0. + // For example, if we need to pack 8 bits into a value and the stored value + // is 0xf0, the actual stored value will be 0b000111'110000, which takes 12 + // bits actually. However, if we changed the order to be 0x0f, then we can + // store it as 0b001111, which takes 6 bits only now. 
+ DeclBits.addBits((uint64_t)D->getModuleOwnershipKind(), /*BitWidth=*/3); + DeclBits.addBit(D->isReferenced()); + DeclBits.addBit(D->isUsed(false)); + DeclBits.addBits(D->getAccess(), /*BitWidth=*/2); + DeclBits.addBit(D->isImplicit()); DeclBits.addBit(D->getDeclContext() != D->getLexicalDeclContext()); - DeclBits.addBit(D->isInvalidDecl()); DeclBits.addBit(D->hasAttrs()); - DeclBits.addBit(D->isImplicit()); - DeclBits.addBit(D->isUsed(false)); - DeclBits.addBit(D->isReferenced()); DeclBits.addBit(D->isTopLevelDeclInObjCContainer()); - DeclBits.addBits(D->getAccess(), /*BitWidth=*/2); - DeclBits.addBits((uint64_t)D->getModuleOwnershipKind(), /*BitWidth=*/3); + DeclBits.addBit(D->isInvalidDecl()); Record.push_back(DeclBits); Record.AddDeclRef(cast_or_null<Decl>(D->getDeclContext())); @@ -493,16 +503,13 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { Record.AddDeclRef(nullptr); } - if (D->getDeclContext() == D->getLexicalDeclContext() && - !D->hasAttrs() && - !D->isImplicit() && - !D->hasExtInfo() && + if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() && + !D->isInvalidDecl() && !D->isImplicit() && !D->hasExtInfo() && !D->getTypedefNameForAnonDecl() && D->getFirstDecl() == D->getMostRecentDecl() && !D->isTopLevelDeclInObjCContainer() && !CXXRecordDecl::classofKind(D->getKind()) && - !D->getIntegerTypeSourceInfo() && - !D->getMemberSpecializationInfo() && + !D->getIntegerTypeSourceInfo() && !D->getMemberSpecializationInfo() && !needsAnonymousDeclarationNumber(D) && D->getDeclName().getNameKind() == DeclarationName::Identifier) AbbrevToUse = Writer.getDeclEnumAbbrev(); @@ -537,9 +544,8 @@ void ASTDeclWriter::VisitRecordDecl(RecordDecl *D) { if (!isa<CXXRecordDecl>(D)) Record.push_back(D->getODRHash()); - if (D->getDeclContext() == D->getLexicalDeclContext() && - !D->hasAttrs() && - !D->hasExtInfo() && + if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() && + !D->isImplicit() && !D->isInvalidDecl() && !D->hasExtInfo() && !D->getTypedefNameForAnonDecl() && D->getFirstDecl() == D->getMostRecentDecl() && !D->isTopLevelDeclInObjCContainer() && @@ -663,11 +669,16 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { Record.AddDeclarationNameLoc(D->DNLoc, D->getDeclName()); Record.push_back(D->getIdentifierNamespace()); + // The order matters here. It will be better to put the bit with higher + // probability to be 0 in the end of the bits. See the comments in VisitDecl + // for details. 
BitsPacker FunctionDeclBits; // FIXME: stable encoding + FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3); FunctionDeclBits.addBits((uint32_t)D->getStorageClass(), /*BitWidth=*/3); FunctionDeclBits.addBit(D->isInlineSpecified()); FunctionDeclBits.addBit(D->isInlined()); + FunctionDeclBits.addBit(D->hasSkippedBody()); FunctionDeclBits.addBit(D->isVirtualAsWritten()); FunctionDeclBits.addBit(D->isPure()); FunctionDeclBits.addBit(D->hasInheritedPrototype()); @@ -678,14 +689,12 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { FunctionDeclBits.addBit(D->isDefaulted()); FunctionDeclBits.addBit(D->isExplicitlyDefaulted()); FunctionDeclBits.addBit(D->isIneligibleOrNotSelected()); - FunctionDeclBits.addBit(D->hasImplicitReturnZero()); FunctionDeclBits.addBits((uint64_t)(D->getConstexprKind()), /*BitWidth=*/2); - FunctionDeclBits.addBit(D->usesSEHTry()); - FunctionDeclBits.addBit(D->hasSkippedBody()); + FunctionDeclBits.addBit(D->hasImplicitReturnZero()); FunctionDeclBits.addBit(D->isMultiVersion()); FunctionDeclBits.addBit(D->isLateTemplateParsed()); FunctionDeclBits.addBit(D->FriendConstraintRefersToEnclosingTemplate()); - FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3); + FunctionDeclBits.addBit(D->usesSEHTry()); Record.push_back(FunctionDeclBits); Record.AddSourceLocation(D->getEndLoc()); @@ -1049,7 +1058,28 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) { VisitRedeclarable(D); VisitDeclaratorDecl(D); + // The order matters here. It will be better to put the bit with higher + // probability to be 0 in the end of the bits. See the comments in VisitDecl + // for details. BitsPacker VarDeclBits; + VarDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), + /*BitWidth=*/3); + + bool ModulesCodegen = false; + if (Writer.WritingModule && D->getStorageDuration() == SD_Static && + !D->getDescribedVarTemplate()) { + // When building a C++20 module interface unit or a partition unit, a + // strong definition in the module interface is provided by the + // compilation of that unit, not by its users. (Inline variables are still + // emitted in module users.) 
+ ModulesCodegen = + (Writer.WritingModule->isInterfaceOrPartition() || + (D->hasAttr<DLLExportAttr>() && + Writer.Context->getLangOpts().BuildingPCHWithObjectFile)) && + Writer.Context->GetGVALinkageForVariable(D) >= GVA_StrongExternal; + } + VarDeclBits.addBit(ModulesCodegen); + VarDeclBits.addBits(D->getStorageClass(), /*BitWidth=*/3); VarDeclBits.addBits(D->getTSCSpec(), /*BitWidth=*/2); VarDeclBits.addBits(D->getInitStyle(), /*BitWidth=*/2); @@ -1061,41 +1091,26 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) { VarDeclBits.addBit(D->isExceptionVariable()); VarDeclBits.addBit(D->isNRVOVariable()); VarDeclBits.addBit(D->isCXXForRangeDecl()); - VarDeclBits.addBit(D->isObjCForDecl()); + VarDeclBits.addBit(D->isInline()); VarDeclBits.addBit(D->isInlineSpecified()); VarDeclBits.addBit(D->isConstexpr()); VarDeclBits.addBit(D->isInitCapture()); VarDeclBits.addBit(D->isPreviousDeclInSameBlockScope()); + VarDeclBits.addBit(D->isEscapingByref()); + HasDeducedType = D->getType()->getContainedDeducedType(); + VarDeclBits.addBit(HasDeducedType); + if (const auto *IPD = dyn_cast<ImplicitParamDecl>(D)) VarDeclBits.addBits(llvm::to_underlying(IPD->getParameterKind()), /*Width=*/3); else VarDeclBits.addBits(0, /*Width=*/3); - VarDeclBits.addBit(D->isEscapingByref()); - HasDeducedType = D->getType()->getContainedDeducedType(); - VarDeclBits.addBit(HasDeducedType); - } - - VarDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), /*BitWidth=*/3); - - bool ModulesCodegen = false; - if (Writer.WritingModule && D->getStorageDuration() == SD_Static && - !D->getDescribedVarTemplate()) { - // When building a C++20 module interface unit or a partition unit, a - // strong definition in the module interface is provided by the - // compilation of that unit, not by its users. (Inline variables are still - // emitted in module users.) 
- ModulesCodegen = - (Writer.WritingModule->isInterfaceOrPartition() || - (D->hasAttr<DLLExportAttr>() && - Writer.Context->getLangOpts().BuildingPCHWithObjectFile)) && - Writer.Context->GetGVALinkageForVariable(D) >= GVA_StrongExternal; + VarDeclBits.addBit(D->isObjCForDecl()); } - VarDeclBits.addBit(ModulesCodegen); Record.push_back(VarDeclBits); if (ModulesCodegen) @@ -1124,23 +1139,17 @@ void ASTDeclWriter::VisitVarDecl(VarDecl *D) { Record.push_back(VarNotTemplate); } - if (D->getDeclContext() == D->getLexicalDeclContext() && - !D->hasAttrs() && + if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() && !D->isTopLevelDeclInObjCContainer() && !needsAnonymousDeclarationNumber(D) && D->getDeclName().getNameKind() == DeclarationName::Identifier && - !D->hasExtInfo() && - D->getFirstDecl() == D->getMostRecentDecl() && - D->getKind() == Decl::Var && - !D->isInline() && - !D->isConstexpr() && - !D->isInitCapture() && - !D->isPreviousDeclInSameBlockScope() && - !D->isEscapingByref() && - !HasDeducedType && - D->getStorageDuration() != SD_Static && - !D->getDescribedVarTemplate() && - !D->getMemberSpecializationInfo()) + !D->hasExtInfo() && D->getFirstDecl() == D->getMostRecentDecl() && + D->getKind() == Decl::Var && !D->isInline() && !D->isConstexpr() && + !D->isInitCapture() && !D->isPreviousDeclInSameBlockScope() && + !D->isEscapingByref() && !HasDeducedType && + D->getStorageDuration() != SD_Static && !D->getDescribedVarTemplate() && + !D->getMemberSpecializationInfo() && !D->isObjCForDecl() && + !isa<ImplicitParamDecl>(D) && !D->isEscapingByref()) AbbrevToUse = Writer.getDeclVarAbbrev(); Code = serialization::DECL_VAR; @@ -1176,7 +1185,8 @@ void ASTDeclWriter::VisitParmVarDecl(ParmVarDecl *D) { // we dynamically check for the properties that we optimize for, but don't // know are true of all PARM_VAR_DECLs. if (D->getDeclContext() == D->getLexicalDeclContext() && !D->hasAttrs() && - !D->hasExtInfo() && D->getStorageClass() == 0 && + !D->hasExtInfo() && D->getStorageClass() == 0 && !D->isInvalidDecl() && + !D->isTopLevelDeclInObjCContainer() && D->getInitStyle() == VarDecl::CInit && // Can params have anything else? D->getInit() == nullptr) // No default expr. 
AbbrevToUse = Writer.getDeclParmVarAbbrev(); @@ -1810,7 +1820,7 @@ void ASTDeclWriter::VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D) { if (!TC && !OwnsDefaultArg && D->getDeclContext() == D->getLexicalDeclContext() && !D->isInvalidDecl() && !D->hasAttrs() && - !D->isTopLevelDeclInObjCContainer() && + !D->isTopLevelDeclInObjCContainer() && !D->isImplicit() && D->getDeclName().getNameKind() == DeclarationName::Identifier) AbbrevToUse = Writer.getDeclTemplateTypeParmAbbrev(); @@ -2094,10 +2104,14 @@ getFunctionDeclAbbrev(serialization::DeclCode Code) { } // Decl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - 12)); // Packed DeclBits: HasStandaloneLexicalDC, - // isInvalidDecl, HasAttrs, isImplicit, isUsed, - // isReferenced, TopLevelDeclInObjCContainer, - // AccessSpecifier, ModuleOwnershipKind + 8)); // Packed DeclBits: ModuleOwnershipKind, + // isUsed, isReferenced, AccessSpecifier, + // isImplicit + // + // The following bits should be 0: + // HasStandaloneLexicalDC, HasAttrs, + // TopLevelDeclInObjCContainer, + // isInvalidDecl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID // NamedDecl @@ -2131,15 +2145,13 @@ getFunctionDeclAbbrev(serialization::DeclCode Code) { // // Add an AbbrevOp for 'size then elements' and use it here. Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); return Abv; } template <FunctionDecl::TemplatedKind Kind> std::shared_ptr<llvm::BitCodeAbbrev> getCXXMethodAbbrev() { - using namespace llvm; - auto Abv = getFunctionDeclAbbrev<Kind>(serialization::DECL_CXX_METHOD); - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); - return Abv; + return getFunctionDeclAbbrev<Kind>(serialization::DECL_CXX_METHOD); } } // namespace @@ -2153,10 +2165,13 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(serialization::DECL_FIELD)); // Decl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - 12)); // Packed DeclBits: HasStandaloneLexicalDC, - // isInvalidDecl, HasAttrs, isImplicit, isUsed, - // isReferenced, TopLevelDeclInObjCContainer, - // AccessSpecifier, ModuleOwnershipKind + 7)); // Packed DeclBits: ModuleOwnershipKind, + // isUsed, isReferenced, AccessSpecifier, + // + // The following bits should be 0: + // isImplicit, HasStandaloneLexicalDC, HasAttrs, + // TopLevelDeclInObjCContainer, + // isInvalidDecl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID // NamedDecl @@ -2216,10 +2231,13 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration // Decl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - 12)); // Packed DeclBits: HasStandaloneLexicalDC, - // isInvalidDecl, HasAttrs, isImplicit, isUsed, - // isReferenced, TopLevelDeclInObjCContainer, - // AccessSpecifier, ModuleOwnershipKind + 7)); // Packed DeclBits: ModuleOwnershipKind, + // isUsed, isReferenced, AccessSpecifier, + // + // The following bits should be 0: + // isImplicit, HasStandaloneLexicalDC, HasAttrs, + // TopLevelDeclInObjCContainer, + // isInvalidDecl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID // NamedDecl @@ -2257,10 +2275,13 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration // Decl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - 12)); // Packed DeclBits: HasStandaloneLexicalDC, - // isInvalidDecl, 
HasAttrs, isImplicit, isUsed, - // isReferenced, TopLevelDeclInObjCContainer, - // AccessSpecifier, ModuleOwnershipKind + 7)); // Packed DeclBits: ModuleOwnershipKind, + // isUsed, isReferenced, AccessSpecifier, + // + // The following bits should be 0: + // isImplicit, HasStandaloneLexicalDC, HasAttrs, + // TopLevelDeclInObjCContainer, + // isInvalidDecl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID // NamedDecl @@ -2305,10 +2326,11 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration // Decl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - 12)); // Packed DeclBits: HasStandaloneLexicalDC, - // isInvalidDecl, HasAttrs, isImplicit, isUsed, - // isReferenced, TopLevelDeclInObjCContainer, - // AccessSpecifier, ModuleOwnershipKind + 8)); // Packed DeclBits: ModuleOwnershipKind, isUsed, + // isReferenced, AccessSpecifier, + // HasStandaloneLexicalDC, HasAttrs, isImplicit, + // TopLevelDeclInObjCContainer, + // isInvalidDecl, Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID // NamedDecl @@ -2345,10 +2367,11 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(0)); // No redeclaration // Decl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - 12)); // Packed DeclBits: HasStandaloneLexicalDC, - // isInvalidDecl, HasAttrs, isImplicit, isUsed, - // isReferenced, TopLevelDeclInObjCContainer, - // AccessSpecifier, ModuleOwnershipKind + 7)); // Packed DeclBits: ModuleOwnershipKind, + // isReferenced, isUsed, AccessSpecifier. Other + // higher bits should be 0: isImplicit, + // HasStandaloneLexicalDC, HasAttrs, + // TopLevelDeclInObjCContainer, isInvalidDecl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID // NamedDecl @@ -2389,12 +2412,13 @@ void ASTWriter::WriteDeclAbbrevs() { // VarDecl Abv->Add(BitCodeAbbrevOp( BitCodeAbbrevOp::Fixed, - 27)); // Packed Var Decl bits: SClass, TSCSpec, InitStyle, + 21)); // Packed Var Decl bits: Linkage, ModulesCodegen, + // SClass, TSCSpec, InitStyle, // isARCPseudoStrong, IsThisDeclarationADemotedDefinition, // isExceptionVariable, isNRVOVariable, isCXXForRangeDecl, - // isObjCForDecl, isInline, isInlineSpecified, isConstexpr, - // isInitCapture, isPrevDeclInSameScope, ImplicitParamKind, - // EscapingByref, HasDeducedType, Linkage, ModulesCodegen + // isInline, isInlineSpecified, isConstexpr, + // isInitCapture, isPrevDeclInSameScope, + // EscapingByref, HasDeducedType, ImplicitParamKind, isObjCForDecl Abv->Add(BitCodeAbbrevOp(0)); // VarKind (local enum) // Type Source Info Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); @@ -2422,10 +2446,11 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(0)); // hasTypeConstraint // Decl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, - 12)); // Packed DeclBits: HasStandaloneLexicalDC, - // isInvalidDecl, HasAttrs, isImplicit, isUsed, - // isReferenced, TopLevelDeclInObjCContainer, - // AccessSpecifier, ModuleOwnershipKind + 7)); // Packed DeclBits: ModuleOwnershipKind, + // isReferenced, isUsed, AccessSpecifier. 
Other + // higher bits should be 0: isImplicit, + // HasStandaloneLexicalDC, HasAttrs, + // TopLevelDeclInObjCContainer, isInvalidDecl Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclContext Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // SubmoduleID // NamedDecl @@ -2471,12 +2496,14 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(serialization::EXPR_DECL_REF)); // Stmt // Expr - // PackingBits: DependenceKind, ValueKind, ObjectKind, HasQualifier, - // GetDeclFound, ExplicitTemplateArgs, HadMultipleCandidates, - // NonOdrUseReason, RefersToEnclosingVariableOrCapture, IsImmediateEscalating - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 18)); + // PackingBits: DependenceKind, ValueKind. ObjectKind should be 0. + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 7)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type // DeclRefExpr + // Packing Bits: , HadMultipleCandidates, RefersToEnclosingVariableOrCapture, + // IsImmediateEscalating, NonOdrUseReason. + // GetDeclFound, HasQualifier and ExplicitTemplateArgs should be 0. + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // DeclRef Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Location DeclRefExprAbbrev = Stream.EmitAbbrev(std::move(Abv)); @@ -2515,11 +2542,12 @@ void ASTWriter::WriteDeclAbbrevs() { // Stmt // Expr // Packing Bits: DependenceKind, ValueKind, ObjectKind, - // HasFPFeatures, CastKind, PartOfExplicitCast - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 19)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type // CastExpr Abv->Add(BitCodeAbbrevOp(0)); // PathSize + // Packing Bits: CastKind, StoredFPFeatures, isPartOfExplicitCast + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 9)); // ImplicitCastExpr ExprImplicitCastAbbrev = Stream.EmitAbbrev(std::move(Abv)); @@ -2528,11 +2556,13 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(serialization::EXPR_BINARY_OPERATOR)); // Stmt // Expr - // Packing Bits: DependenceKind, ValueKind, ObjectKind, - // HasFPFeatures, OpKind - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 17)); + // Packing Bits: DependenceKind. ValueKind and ObjectKind should + // be 0 in this case. + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type // BinaryOperator + Abv->Add( + BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpCode and HasFPFeatures Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location BinaryOperatorAbbrev = Stream.EmitAbbrev(std::move(Abv)); @@ -2541,11 +2571,14 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(serialization::EXPR_COMPOUND_ASSIGN_OPERATOR)); // Stmt // Expr - // Packing Bits: DependenceKind, ValueKind, ObjectKind, - // HasFPFeatures, OpKind - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 17)); + // Packing Bits: DependenceKind. ValueKind and ObjectKind should + // be 0 in this case. + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 5)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type // BinaryOperator + // Packing Bits: OpCode. 
The HasFPFeatures bit should be 0 + Abv->Add( + BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpCode and HasFPFeatures Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location // CompoundAssignOperator Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHSType @@ -2558,10 +2591,11 @@ void ASTWriter::WriteDeclAbbrevs() { // Stmt // Expr // Packing Bits: DependenceKind, ValueKind, ObjectKind, - // NumArgs, hasStoredFPFeatures, ADLCallKind - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 25)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type // CallExpr + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // NumArgs + Abv->Add(BitCodeAbbrevOp(0)); // ADLCallKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location CallExprAbbrev = Stream.EmitAbbrev(std::move(Abv)); @@ -2571,12 +2605,14 @@ void ASTWriter::WriteDeclAbbrevs() { // Stmt // Expr // Packing Bits: DependenceKind, ValueKind, ObjectKind, - // NumArgs, hasStoredFPFeatures, ADLCallKind, OperatorKind - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 31)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type // CallExpr + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // NumArgs + Abv->Add(BitCodeAbbrevOp(0)); // ADLCallKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location // CXXOperatorCallExpr + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Operator Kind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location CXXOperatorCallExprAbbrev = Stream.EmitAbbrev(std::move(Abv)); @@ -2587,10 +2623,11 @@ void ASTWriter::WriteDeclAbbrevs() { // Stmt // Expr // Packing Bits: DependenceKind, ValueKind, ObjectKind, - // NumArgs, hasStoredFPFeatures, ADLCallKind - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 25)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Type // CallExpr + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // NumArgs + Abv->Add(BitCodeAbbrevOp(0)); // ADLCallKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location // CXXMemberCallExpr CXXMemberCallExprAbbrev = Stream.EmitAbbrev(std::move(Abv)); @@ -2600,8 +2637,8 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(serialization::STMT_COMPOUND)); // Stmt // CompoundStmt - // Packing Bits: Num Stmts, hasStoredFPFeatures - Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 21)); + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Num Stmts + Abv->Add(BitCodeAbbrevOp(0)); // hasStoredFPFeatures Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Source Location CompoundStmtAbbrev = Stream.EmitAbbrev(std::move(Abv)); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 02cc7798abdb2..7f888e44dde1e 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -138,10 +138,8 @@ void ASTStmtWriter::VisitNullStmt(NullStmt *S) { void ASTStmtWriter::VisitCompoundStmt(CompoundStmt *S) { VisitStmt(S); - CurrentPackingBits.updateBits(); - // 20 bits should be enough to store the size of stmts. 
- CurrentPackingBits.addBits(S->size(), /*Width=*/20); - CurrentPackingBits.addBit(S->hasStoredFPFeatures()); + Record.push_back(S->size()); + Record.push_back(S->hasStoredFPFeatures()); for (auto *CS : S->body()) Record.AddStmt(CS); @@ -675,25 +673,26 @@ void ASTStmtWriter::VisitPredefinedExpr(PredefinedExpr *E) { void ASTStmtWriter::VisitDeclRefExpr(DeclRefExpr *E) { VisitExpr(E); - CurrentPackingBits.addBit(E->hasQualifier()); - CurrentPackingBits.addBit(E->getDecl() != E->getFoundDecl()); - CurrentPackingBits.addBit(E->hasTemplateKWAndArgsInfo()); + CurrentPackingBits.updateBits(); + CurrentPackingBits.addBit(E->hadMultipleCandidates()); CurrentPackingBits.addBit(E->refersToEnclosingVariableOrCapture()); CurrentPackingBits.addBits(E->isNonOdrUse(), /*Width=*/2); CurrentPackingBits.addBit(E->isImmediateEscalating()); + CurrentPackingBits.addBit(E->getDecl() != E->getFoundDecl()); + CurrentPackingBits.addBit(E->hasQualifier()); + CurrentPackingBits.addBit(E->hasTemplateKWAndArgsInfo()); if (E->hasTemplateKWAndArgsInfo()) { unsigned NumTemplateArgs = E->getNumTemplateArgs(); - // 12 bits should be sufficient to store the number of template args. - CurrentPackingBits.addBits(NumTemplateArgs, /*Width=*/12); + Record.push_back(NumTemplateArgs); } DeclarationName::NameKind nk = (E->getDecl()->getDeclName().getNameKind()); if ((!E->hasTemplateKWAndArgsInfo()) && (!E->hasQualifier()) && (E->getDecl() == E->getFoundDecl()) && - nk == DeclarationName::Identifier) { + nk == DeclarationName::Identifier && E->getObjectKind() == OK_Ordinary) { AbbrevToUse = Writer.getDeclRefExprAbbrev(); } @@ -936,10 +935,10 @@ void ASTStmtWriter::VisitOMPIteratorExpr(OMPIteratorExpr *E) { void ASTStmtWriter::VisitCallExpr(CallExpr *E) { VisitExpr(E); - // 13 bits should be sufficient to store the number args; - CurrentPackingBits.addBits(E->getNumArgs(), /*BitsWidth=*/13); - CurrentPackingBits.addBit(E->hasStoredFPFeatures()); + Record.push_back(E->getNumArgs()); + CurrentPackingBits.updateBits(); CurrentPackingBits.addBit(static_cast<bool>(E->getADLCallKind())); + CurrentPackingBits.addBit(E->hasStoredFPFeatures()); Record.AddSourceLocation(E->getRParenLoc()); Record.AddStmt(E->getCallee()); @@ -950,7 +949,8 @@ void ASTStmtWriter::VisitCallExpr(CallExpr *E) { if (E->hasStoredFPFeatures()) Record.push_back(E->getFPFeatures().getAsOpaqueInt()); - if (!E->hasStoredFPFeatures() && E->getStmtClass() == Stmt::CallExprClass) + if (!E->hasStoredFPFeatures() && !static_cast<bool>(E->getADLCallKind()) && + E->getStmtClass() == Stmt::CallExprClass) AbbrevToUse = Writer.getCallExprAbbrev(); Code = serialization::EXPR_CALL; @@ -979,12 +979,11 @@ void ASTStmtWriter::VisitMemberExpr(MemberExpr *E) { // Write these first for easy access when deserializing, as they affect the // size of the MemberExpr. - + CurrentPackingBits.updateBits(); CurrentPackingBits.addBit(HasQualifier); CurrentPackingBits.addBit(HasFoundDecl); CurrentPackingBits.addBit(HasTemplateInfo); - // 12 bits should be enough to store the number of args - CurrentPackingBits.addBits(NumTemplateArgs, /*Width=*/12); + Record.push_back(NumTemplateArgs); Record.AddStmt(E->getBase()); Record.AddDeclRef(E->getMemberDecl()); @@ -1041,9 +1040,10 @@ void ASTStmtWriter::VisitCastExpr(CastExpr *E) { VisitExpr(E); Record.push_back(E->path_size()); - CurrentPackingBits.addBit(E->hasStoredFPFeatures()); + CurrentPackingBits.updateBits(); // 7 bits should be enough to store the casting kinds. 
CurrentPackingBits.addBits(E->getCastKind(), /*Width=*/7); + CurrentPackingBits.addBit(E->hasStoredFPFeatures()); Record.AddStmt(E->getSubExpr()); for (CastExpr::path_iterator @@ -1056,18 +1056,21 @@ void ASTStmtWriter::VisitCastExpr(CastExpr *E) { void ASTStmtWriter::VisitBinaryOperator(BinaryOperator *E) { VisitExpr(E); - bool HasFPFeatures = E->hasStoredFPFeatures(); + // Write this first for easy access when deserializing, as they affect the // size of the UnaryOperator. - CurrentPackingBits.addBit(HasFPFeatures); + CurrentPackingBits.updateBits(); CurrentPackingBits.addBits(E->getOpcode(), /*Width=*/6); + bool HasFPFeatures = E->hasStoredFPFeatures(); + CurrentPackingBits.addBit(HasFPFeatures); Record.AddStmt(E->getLHS()); Record.AddStmt(E->getRHS()); Record.AddSourceLocation(E->getOperatorLoc()); if (HasFPFeatures) Record.push_back(E->getStoredFPFeatures().getAsOpaqueInt()); - if (!HasFPFeatures) + if (!HasFPFeatures && E->getValueKind() == VK_PRValue && + E->getObjectKind() == OK_Ordinary) AbbrevToUse = Writer.getBinaryOperatorAbbrev(); Code = serialization::EXPR_BINARY_OPERATOR; @@ -1078,7 +1081,8 @@ void ASTStmtWriter::VisitCompoundAssignOperator(CompoundAssignOperator *E) { Record.AddTypeRef(E->getComputationLHSType()); Record.AddTypeRef(E->getComputationResultType()); - if (!E->hasStoredFPFeatures()) + if (!E->hasStoredFPFeatures() && E->getValueKind() == VK_PRValue && + E->getObjectKind() == OK_Ordinary) AbbrevToUse = Writer.getCompoundAssignOperatorAbbrev(); Code = serialization::EXPR_COMPOUND_ASSIGN_OPERATOR; @@ -1664,10 +1668,10 @@ void ASTStmtWriter::VisitMSDependentExistsStmt(MSDependentExistsStmt *S) { void ASTStmtWriter::VisitCXXOperatorCallExpr(CXXOperatorCallExpr *E) { VisitCallExpr(E); - CurrentPackingBits.addBits(E->getOperator(), /*Width=*/6); + Record.push_back(E->getOperator()); Record.AddSourceRange(E->Range); - if (!E->hasStoredFPFeatures()) + if (!E->hasStoredFPFeatures() && !static_cast<bool>(E->getADLCallKind())) AbbrevToUse = Writer.getCXXOperatorCallExprAbbrev(); Code = serialization::EXPR_CXX_OPERATOR_CALL; @@ -1676,7 +1680,7 @@ void ASTStmtWriter::VisitCXXOperatorCallExpr(CXXOperatorCallExpr *E) { void ASTStmtWriter::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) { VisitCallExpr(E); - if (!E->hasStoredFPFeatures()) + if (!E->hasStoredFPFeatures() && !static_cast<bool>(E->getADLCallKind())) AbbrevToUse = Writer.getCXXMemberCallExprAbbrev(); Code = serialization::EXPR_CXX_MEMBER_CALL; @@ -1838,6 +1842,7 @@ void ASTStmtWriter::VisitCXXThisExpr(CXXThisExpr *E) { VisitExpr(E); Record.AddSourceLocation(E->getLocation()); Record.push_back(E->isImplicit()); + Code = serialization::EXPR_CXX_THIS; } @@ -1971,10 +1976,9 @@ void ASTStmtWriter::VisitCXXDependentScopeMemberExpr( // Don't emit anything here (or if you do you will have to update // the corresponding deserialization function). - + Record.push_back(E->getNumTemplateArgs()); + CurrentPackingBits.updateBits(); CurrentPackingBits.addBit(E->hasTemplateKWAndArgsInfo()); - // 16 bits should be enough to store the number of template args. - CurrentPackingBits.addBits(E->getNumTemplateArgs(), /*Width=*/16); CurrentPackingBits.addBit(E->hasFirstQualifierFoundInScope()); if (E->hasTemplateKWAndArgsInfo()) { @@ -2041,15 +2045,14 @@ ASTStmtWriter::VisitCXXUnresolvedConstructExpr(CXXUnresolvedConstructExpr *E) { void ASTStmtWriter::VisitOverloadExpr(OverloadExpr *E) { VisitExpr(E); + Record.push_back(E->getNumDecls()); + CurrentPackingBits.updateBits(); - // 12 Bits should enough to store the number of decls. 
-  CurrentPackingBits.addBits(E->getNumDecls(), /*BitWidth=*/12);
   CurrentPackingBits.addBit(E->hasTemplateKWAndArgsInfo());
 
   if (E->hasTemplateKWAndArgsInfo()) {
     const ASTTemplateKWAndArgsInfo &ArgInfo =
         *E->getTrailingASTTemplateKWAndArgsInfo();
-    // 12 Bits should enough to store the number of template args.
-    CurrentPackingBits.addBits(ArgInfo.NumTemplateArgs, /*BitWidth=*/12);
+    Record.push_back(ArgInfo.NumTemplateArgs);
     AddTemplateKWAndArgsInfo(ArgInfo, E->getTrailingTemplateArgumentLoc());
   }

From db8a119e8f04dfccd40ab7675c62ada81423e5c2 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Thu, 21 Dec 2023 17:39:36 +0900
Subject: [PATCH 036/342] [mlir][ArmSME] Fix invalid rewriter API usage
 (#76123)

When operations are modified in-place, the rewriter must be notified.

This commit fixes `mlir/test/Conversion/ArmSMEToLLVM/unsupported.mlir`,
`mlir/test/Dialect/ArmSME/tile-zero-masks.mlir` and
`mlir/test/Dialect/ArmSME/vector-ops-to-llvm.mlir` when running with
`MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS` enabled.
---
 mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
index 597846e31e218..8aa51f352f822 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
@@ -223,8 +223,10 @@ struct AssignTileIDsPattern
     if (failed(tileId))
       return tileOp.emitError("ran out of SME virtual tiles!");
 
-    func->setDiscardableAttr(kTilesInUseAttr,
-                             rewriter.getI32IntegerAttr((unsigned)tilesInUse));
+    rewriter.updateRootInPlace(func, [&]() {
+      func->setDiscardableAttr(
+          kTilesInUseAttr, rewriter.getI32IntegerAttr((unsigned)tilesInUse));
+    });
 
     // Find all the ops that (transitively) depend on this tile.
     SetVector<Operation *> dependantOps;
@@ -245,14 +247,15 @@ struct AssignTileIDsPattern
     // scf.if, and moving the contents of %tileA or %tileB to result tile (based
     // on the %some_cond).
     auto tileIDAttr = rewriter.getI32IntegerAttr(*tileId);
-    tileOp.setTileId(tileIDAttr);
+    rewriter.updateRootInPlace(tileOp, [&]() { tileOp.setTileId(tileIDAttr); });
     for (auto *op : dependantOps) {
       if (auto tileOp = llvm::dyn_cast<ArmSMETileOpInterface>(op)) {
         auto currentTileId = tileOp.getTileId();
         if (currentTileId && unsigned(currentTileId.getInt()) != tileId)
           return tileOp.emitOpError(
               "already assigned different SME virtual tile!");
-        tileOp.setTileId(tileIDAttr);
+        rewriter.updateRootInPlace(tileOp,
+                                   [&]() { tileOp.setTileId(tileIDAttr); });
       }
     }
 
From d3ef86708241a3bee902615c190dead1638c4e09 Mon Sep 17 00:00:00 2001
From: "boxu.zhang" <boxu-zhang@users.noreply.github.com>
Date: Thu, 21 Dec 2023 16:47:46 +0800
Subject: [PATCH 037/342] [LoopUnroll] Make UnrollMaxUpperBound to be
 overridable by target (#76029)

The UnrollMaxUpperBound threshold should be target dependent: different
chips provide register sets of different sizes, which changes how many
temporary values a program can profitably keep live after unrolling. So I
add a MaxUpperBound value to UnrollingPreferences that can be overridden
by targets.

All uses of UnrollMaxUpperBound are replaced with UP.MaxUpperBound. The
default value is still 8, and the command line argument
'--unroll-max-upperbound' still takes final effect if provided.
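As an illustration of the intended use, a target can now raise the bound
from its getUnrollingPreferences() hook. A minimal sketch only; the
`MyTargetTTIImpl` class name and the value 16 are hypothetical and not
part of this patch:

  // Hypothetical target override: keep the generic defaults, then allow
  // upper-bound unrolling of larger loops on chips with big register files.
  void MyTargetTTIImpl::getUnrollingPreferences(
      Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
      OptimizationRemarkEmitter *ORE) {
    BaseT::getUnrollingPreferences(L, SE, UP, ORE);
    UP.MaxUpperBound = 16; // generic default remains 8
  }

Targets that leave UP.MaxUpperBound untouched keep today's behaviour,
because gatherUnrollingPreferences() still seeds it from
UnrollMaxUpperBound.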
---
 llvm/include/llvm/Analysis/TargetTransformInfo.h |  4 ++++
 llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp    |  9 ++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f5114fa40c70a..735be3680aea0 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -560,6 +560,10 @@ class TargetTransformInfo {
     // (set to UINT_MAX to disable). This does not apply in cases where the
     // loop is being fully unrolled.
     unsigned MaxCount;
+    /// Set the maximum upper bound of trip count. Allowing the MaxUpperBound
+    /// to be overridden by a target gives more flexibility in certain cases.
+    /// By default, MaxUpperBound uses UnrollMaxUpperBound, whose value is 8.
+    unsigned MaxUpperBound;
     /// Set the maximum unrolling factor for full unrolling. Like MaxCount, but
     /// applies even if full unrolling is selected. This allows a target to fall
     /// back to Partial unrolling if full unrolling is above FullUnrollMaxCount.
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index f14541a1a037e..7cfeb019af972 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -200,6 +200,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
   UP.Count = 0;
   UP.DefaultUnrollRuntimeCount = 8;
   UP.MaxCount = std::numeric_limits<unsigned>::max();
+  UP.MaxUpperBound = UnrollMaxUpperBound;
   UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
   UP.BEInsns = 2;
   UP.Partial = false;
@@ -237,6 +238,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
     UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
   if (UnrollMaxCount.getNumOccurrences() > 0)
     UP.MaxCount = UnrollMaxCount;
+  if (UnrollMaxUpperBound.getNumOccurrences() > 0)
+    UP.MaxUpperBound = UnrollMaxUpperBound;
   if (UnrollFullMaxCount.getNumOccurrences() > 0)
     UP.FullUnrollMaxCount = UnrollFullMaxCount;
   if (UnrollAllowPartial.getNumOccurrences() > 0)
@@ -777,7 +780,7 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
     return TripCount;
 
   if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount &&
-      MaxTripCount <= UnrollMaxUpperBound)
+      MaxTripCount <= UP.MaxUpperBound)
     return MaxTripCount;
 
   // if didn't return until here, should continue to other priorties
@@ -952,7 +955,7 @@ bool llvm::computeUnrollCount(
   // cost of exact full unrolling. As such, if we have an exact count and
   // found it unprofitable, we'll never chose to bounded unroll.
   if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
-      MaxTripCount <= UnrollMaxUpperBound) {
+      MaxTripCount <= UP.MaxUpperBound) {
     UP.Count = MaxTripCount;
     if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
                                              MaxTripCount, UCE, UP)) {
@@ -1026,7 +1029,7 @@ bool llvm::computeUnrollCount(
   }
 
   // Don't unroll a small upper bound loop unless user or TTI asked to do so.
-  if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
+  if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) {
     UP.Count = 0;
     return false;
   }

From 68fb3d596e451cbb9e40c01d26c4e9af1126ce01 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 21 Dec 2023 09:13:32 +0000
Subject: [PATCH 038/342] [ConstraintElim] Add test with select where the
 second op can't be poison.

Extra test for TODO from #75750.
--- .../and-implied-by-operands.ll | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll index 2de7592d5ccc7..5c49ca0e96f30 100644 --- a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll +++ b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll @@ -478,3 +478,24 @@ entry: %and = select i1 %c.1, i1 %c.2, i1 false ret i1 %and } + +define i1 @and_select_second_implies_first_guaranteed_not_poison(ptr noundef %A, ptr noundef %B) { +; CHECK-LABEL: @and_select_second_implies_first_guaranteed_not_poison( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ne ptr [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds ptr, ptr [[B]], i64 -1 +; CHECK-NEXT: [[C_2:%.*]] = icmp ugt ptr [[GEP]], [[A]] +; CHECK-NEXT: call void @no_noundef(i1 [[C_2]]) +; CHECK-NEXT: [[AND:%.*]] = select i1 [[C_1]], i1 [[C_2]], i1 false +; CHECK-NEXT: ret i1 [[AND]] +; +entry: + %c.1 = icmp ne ptr %A, %B + %gep = getelementptr inbounds ptr, ptr %B, i64 -1 + %c.2 = icmp ugt ptr %gep, %A + call void @no_noundef(i1 %c.2) + %and = select i1 %c.1, i1 %c.2, i1 false + ret i1 %and +} + +declare void @no_noundef(i1 noundef) From c0931d4950a93526aa08ec3ab86f64ffb616b406 Mon Sep 17 00:00:00 2001 From: David Green <david.green@arm.com> Date: Thu, 21 Dec 2023 09:22:23 +0000 Subject: [PATCH 039/342] [AArch64][GlobalISel] Lower scalarizing G_UNMERGE_VALUES to G_EXTRACT_VECTOR_ELT This adds post-legalizing lowering of G_UNMERGE_VALUES which take a vector and produce scalar values for each lane. They are converted to a G_EXTRACT_VECTOR_ELT for each lane, allowing all the existing tablegen patterns to apply to them. A couple of tablegen patterns need to be altered to make sure the type of the constant operand is known, so that the patterns are recognized under global isel. 
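Condensed, the rewrite added here is the following (a sketch of the apply
step only; the full combine, including the match step that checks the
unmerge really scalarizes a fixed-length vector, is in
AArch64PostLegalizerLowering.cpp below, and the helper name is purely
illustrative):

  // E.g.  %a:_(s32), %b:_(s32) = G_UNMERGE_VALUES %v:_(<2 x s32>)
  // becomes one G_EXTRACT_VECTOR_ELT per lane of %v.
  static void scalarizeUnmerge(GUnmerge &Unmerge, MachineRegisterInfo &MRI,
                               MachineIRBuilder &B) {
    Register SrcReg = Unmerge.getReg(Unmerge.getNumOperands() - 1);
    LLT SrcTy = MRI.getType(SrcReg);
    for (int I = 0; I < SrcTy.getNumElements(); ++I)
      B.buildExtractVectorElementConstant(Unmerge.getReg(I), SrcReg, I);
    Unmerge.eraseFromParent();
  }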
Closes #75662 --- llvm/lib/Target/AArch64/AArch64Combine.td | 11 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 12 +- .../GISel/AArch64PostLegalizerLowering.cpp | 21 ++ llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll | 176 +++++----- llvm/test/CodeGen/AArch64/aarch64-mulv.ll | 155 ++++----- llvm/test/CodeGen/AArch64/aarch64-smull.ll | 18 +- llvm/test/CodeGen/AArch64/fptoi.ll | 16 +- llvm/test/CodeGen/AArch64/reduce-and.ll | 309 +++++++----------- llvm/test/CodeGen/AArch64/reduce-or.ll | 309 +++++++----------- llvm/test/CodeGen/AArch64/reduce-xor.ll | 309 +++++++----------- llvm/test/CodeGen/AArch64/sext.ll | 42 +-- .../AArch64/vecreduce-umax-legalization.ll | 98 +++--- llvm/test/CodeGen/AArch64/xtn.ll | 24 +- llvm/test/CodeGen/AArch64/zext.ll | 41 +-- 14 files changed, 622 insertions(+), 919 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index d6c00be80bd9c..99f256b887821 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -134,6 +134,14 @@ def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn, form_duplane, shuf_to_ins]>; +// Turn G_UNMERGE_VALUES -> G_EXTRACT_VECTOR_ELT's +def vector_unmerge_lowering : GICombineRule < + (defs root:$root), + (match (wip_match_opcode G_UNMERGE_VALUES):$root, + [{ return matchScalarizeVectorUnmerge(*${root}, MRI); }]), + (apply [{ applyScalarizeVectorUnmerge(*${root}, MRI, B); }]) +>; + def adjust_icmp_imm_matchdata : GIDefMatchData<"std::pair<uint64_t, CmpInst::Predicate>">; def adjust_icmp_imm : GICombineRule < @@ -251,7 +259,8 @@ def AArch64PostLegalizerLowering icmp_lowering, build_vector_lowering, lower_vector_fcmp, form_truncstore, vector_sext_inreg_to_shift, - unmerge_ext_to_unmerge, lower_mull]> { + unmerge_ext_to_unmerge, lower_mull, + vector_unmerge_lowering]> { } // Post-legalization combines which are primarily optimizations. diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index bdb38f0c37895..4ccac40f99a0a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6482,23 +6482,23 @@ def : Pat<(v2i64 (vector_insert v2i64:$src, (i64 (bitconvert (f64 FPR64:$Sn))), // f32 bitcast(vector_extract(v4i32 src, lane)) -> EXTRACT_SUBREG(INSvi32lane(-, 0, src, lane)) def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, imm:$Immd)))), (EXTRACT_SUBREG (INSvi32lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), ssub)>; -def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, 0)))), +def : Pat<(f32 (bitconvert (i32 (vector_extract v4i32:$src, (i64 0))))), (EXTRACT_SUBREG V128:$src, ssub)>; def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, imm:$Immd)))), (EXTRACT_SUBREG (INSvi64lane (IMPLICIT_DEF), 0, V128:$src, imm:$Immd), dsub)>; -def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, 0)))), +def : Pat<(f64 (bitconvert (i64 (vector_extract v2i64:$src, (i64 0))))), (EXTRACT_SUBREG V128:$src, dsub)>; // Floating point vector extractions are codegen'd as either a sequence of // subregister extractions, or a MOV (aka DUP here) if // the lane number is anything other than zero. 
-def : Pat<(vector_extract (v2f64 V128:$Rn), 0), +def : Pat<(f64 (vector_extract (v2f64 V128:$Rn), (i64 0))), (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>; -def : Pat<(vector_extract (v4f32 V128:$Rn), 0), +def : Pat<(f32 (vector_extract (v4f32 V128:$Rn), (i64 0))), (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>; -def : Pat<(vector_extract (v8f16 V128:$Rn), 0), +def : Pat<(f16 (vector_extract (v8f16 V128:$Rn), (i64 0))), (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>; -def : Pat<(vector_extract (v8bf16 V128:$Rn), 0), +def : Pat<(bf16 (vector_extract (v8bf16 V128:$Rn), (i64 0))), (bf16 (EXTRACT_SUBREG V128:$Rn, hsub))>; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 687063873a16b..830203b61c586 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -769,6 +769,27 @@ void applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI, MI.eraseFromParent(); } +bool matchScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI) { + auto &Unmerge = cast<GUnmerge>(MI); + Register Src1Reg = Unmerge.getReg(Unmerge.getNumOperands() - 1); + const LLT SrcTy = MRI.getType(Src1Reg); + return SrcTy.isVector() && !SrcTy.isScalable() && + Unmerge.getNumOperands() == (unsigned)SrcTy.getNumElements() + 1; +} + +void applyScalarizeVectorUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) { + auto &Unmerge = cast<GUnmerge>(MI); + Register Src1Reg = Unmerge.getReg(Unmerge.getNumOperands() - 1); + const LLT SrcTy = MRI.getType(Src1Reg); + assert((SrcTy.isVector() && !SrcTy.isScalable()) && + "Expected a fixed length vector"); + + for (int I = 0; I < SrcTy.getNumElements(); ++I) + B.buildExtractVectorElementConstant(Unmerge.getReg(I), Src1Reg, I); + MI.eraseFromParent(); +} + bool matchBuildVectorToDup(MachineInstr &MI, MachineRegisterInfo &MRI) { assert(MI.getOpcode() == TargetOpcode::G_BUILD_VECTOR); auto Splat = getAArch64VectorSplat(MI, MRI); diff --git a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll index 5e477e8947d1b..194fe5be40c2b 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -516,20 +516,17 @@ define i8 @sminv_v4i8(<4 x i8> %a) { ; CHECK-GI-LABEL: sminv_v4i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov h3, v0.h[3] +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: umov w12, v0.h[3] +; CHECK-GI-NEXT: sxtb w11, w8 +; CHECK-GI-NEXT: cmp w11, w9, sxtb +; CHECK-GI-NEXT: sxtb w11, w10 +; CHECK-GI-NEXT: csel w8, w8, w9, lt +; CHECK-GI-NEXT: cmp w11, w12, sxtb ; CHECK-GI-NEXT: sxtb w9, w8 -; CHECK-GI-NEXT: fmov w10, s1 -; CHECK-GI-NEXT: fmov w11, s2 -; CHECK-GI-NEXT: cmp w9, w10, sxtb -; CHECK-GI-NEXT: sxtb w9, w11 -; CHECK-GI-NEXT: csel w8, w8, w10, lt -; CHECK-GI-NEXT: fmov w10, s3 -; CHECK-GI-NEXT: cmp w9, w10, sxtb -; CHECK-GI-NEXT: sxtb w9, w8 -; CHECK-GI-NEXT: csel w10, w11, w10, lt +; CHECK-GI-NEXT: csel w10, w10, w12, lt ; CHECK-GI-NEXT: cmp w9, w10, sxtb ; CHECK-GI-NEXT: csel w0, w8, w10, lt ; CHECK-GI-NEXT: ret @@ -611,19 +608,16 @@ define i16 @sminv_v3i16(<3 x i16> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; 
CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: sxth w8, w8 -; CHECK-GI-NEXT: fmov w10, s1 -; CHECK-GI-NEXT: fmov w11, s2 +; CHECK-GI-NEXT: smov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[0] +; CHECK-GI-NEXT: umov w10, v0.h[1] +; CHECK-GI-NEXT: smov w11, v0.h[2] +; CHECK-GI-NEXT: umov w13, v0.h[2] ; CHECK-GI-NEXT: fmov w12, s1 -; CHECK-GI-NEXT: cmp w8, w10, sxth -; CHECK-GI-NEXT: sxth w8, w11 -; CHECK-GI-NEXT: fmov w10, s2 -; CHECK-GI-NEXT: csel w9, w9, w12, lt -; CHECK-GI-NEXT: cmp w8, w9, sxth -; CHECK-GI-NEXT: csel w0, w9, w10, gt +; CHECK-GI-NEXT: cmp w8, w12, sxth +; CHECK-GI-NEXT: csel w8, w9, w10, lt +; CHECK-GI-NEXT: cmp w11, w8, sxth +; CHECK-GI-NEXT: csel w0, w8, w13, gt ; CHECK-GI-NEXT: ret entry: %arg1 = call i16 @llvm.vector.reduce.smin.v3i16(<3 x i16> %a) @@ -887,20 +881,17 @@ define i8 @smaxv_v4i8(<4 x i8> %a) { ; CHECK-GI-LABEL: smaxv_v4i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: sxtb w9, w8 -; CHECK-GI-NEXT: fmov w10, s1 -; CHECK-GI-NEXT: fmov w11, s2 -; CHECK-GI-NEXT: cmp w9, w10, sxtb -; CHECK-GI-NEXT: sxtb w9, w11 -; CHECK-GI-NEXT: csel w8, w8, w10, gt -; CHECK-GI-NEXT: fmov w10, s3 -; CHECK-GI-NEXT: cmp w9, w10, sxtb +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: umov w12, v0.h[3] +; CHECK-GI-NEXT: sxtb w11, w8 +; CHECK-GI-NEXT: cmp w11, w9, sxtb +; CHECK-GI-NEXT: sxtb w11, w10 +; CHECK-GI-NEXT: csel w8, w8, w9, gt +; CHECK-GI-NEXT: cmp w11, w12, sxtb ; CHECK-GI-NEXT: sxtb w9, w8 -; CHECK-GI-NEXT: csel w10, w11, w10, gt +; CHECK-GI-NEXT: csel w10, w10, w12, gt ; CHECK-GI-NEXT: cmp w9, w10, sxtb ; CHECK-GI-NEXT: csel w0, w8, w10, gt ; CHECK-GI-NEXT: ret @@ -982,19 +973,16 @@ define i16 @smaxv_v3i16(<3 x i16> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: sxth w8, w8 -; CHECK-GI-NEXT: fmov w10, s1 -; CHECK-GI-NEXT: fmov w11, s2 +; CHECK-GI-NEXT: smov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[0] +; CHECK-GI-NEXT: umov w10, v0.h[1] +; CHECK-GI-NEXT: smov w11, v0.h[2] +; CHECK-GI-NEXT: umov w13, v0.h[2] ; CHECK-GI-NEXT: fmov w12, s1 -; CHECK-GI-NEXT: cmp w8, w10, sxth -; CHECK-GI-NEXT: sxth w8, w11 -; CHECK-GI-NEXT: fmov w10, s2 -; CHECK-GI-NEXT: csel w9, w9, w12, gt -; CHECK-GI-NEXT: cmp w8, w9, sxth -; CHECK-GI-NEXT: csel w0, w9, w10, lt +; CHECK-GI-NEXT: cmp w8, w12, sxth +; CHECK-GI-NEXT: csel w8, w9, w10, gt +; CHECK-GI-NEXT: cmp w11, w8, sxth +; CHECK-GI-NEXT: csel w0, w8, w13, lt ; CHECK-GI-NEXT: ret entry: %arg1 = call i16 @llvm.vector.reduce.smax.v3i16(<3 x i16> %a) @@ -1256,19 +1244,16 @@ define i8 @uminv_v4i8(<4 x i8> %a) { ; CHECK-GI-LABEL: uminv_v4i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w10, s1 -; CHECK-GI-NEXT: fmov w11, s2 -; CHECK-GI-NEXT: fmov w12, s3 -; CHECK-GI-NEXT: and w9, w8, #0xff -; CHECK-GI-NEXT: cmp w9, w10, uxtb -; CHECK-GI-NEXT: and w9, w11, #0xff -; CHECK-GI-NEXT: csel w8, w8, w10, lo -; 
CHECK-GI-NEXT: cmp w9, w12, uxtb -; CHECK-GI-NEXT: csel w9, w11, w12, lo +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: umov w11, v0.h[3] +; CHECK-GI-NEXT: and w12, w8, #0xff +; CHECK-GI-NEXT: cmp w12, w9, uxtb +; CHECK-GI-NEXT: and w12, w10, #0xff +; CHECK-GI-NEXT: csel w8, w8, w9, lo +; CHECK-GI-NEXT: cmp w12, w11, uxtb +; CHECK-GI-NEXT: csel w9, w10, w11, lo ; CHECK-GI-NEXT: and w10, w8, #0xff ; CHECK-GI-NEXT: cmp w10, w9, uxtb ; CHECK-GI-NEXT: csel w0, w8, w9, lo @@ -1351,19 +1336,16 @@ define i16 @uminv_v3i16(<3 x i16> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: uxth w8, w8 -; CHECK-GI-NEXT: fmov w10, s1 -; CHECK-GI-NEXT: fmov w11, s2 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[0] +; CHECK-GI-NEXT: umov w10, v0.h[1] +; CHECK-GI-NEXT: umov w11, v0.h[2] +; CHECK-GI-NEXT: umov w13, v0.h[2] ; CHECK-GI-NEXT: fmov w12, s1 -; CHECK-GI-NEXT: cmp w8, w10, uxth -; CHECK-GI-NEXT: uxth w8, w11 -; CHECK-GI-NEXT: fmov w10, s2 -; CHECK-GI-NEXT: csel w9, w9, w12, lo -; CHECK-GI-NEXT: cmp w8, w9, uxth -; CHECK-GI-NEXT: csel w0, w9, w10, hi +; CHECK-GI-NEXT: cmp w8, w12, uxth +; CHECK-GI-NEXT: csel w8, w9, w10, lo +; CHECK-GI-NEXT: cmp w11, w8, uxth +; CHECK-GI-NEXT: csel w0, w8, w13, hi ; CHECK-GI-NEXT: ret entry: %arg1 = call i16 @llvm.vector.reduce.umin.v3i16(<3 x i16> %a) @@ -1625,19 +1607,16 @@ define i8 @umaxv_v4i8(<4 x i8> %a) { ; CHECK-GI-LABEL: umaxv_v4i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w10, s1 -; CHECK-GI-NEXT: fmov w11, s2 -; CHECK-GI-NEXT: fmov w12, s3 -; CHECK-GI-NEXT: and w9, w8, #0xff -; CHECK-GI-NEXT: cmp w9, w10, uxtb -; CHECK-GI-NEXT: and w9, w11, #0xff -; CHECK-GI-NEXT: csel w8, w8, w10, hi -; CHECK-GI-NEXT: cmp w9, w12, uxtb -; CHECK-GI-NEXT: csel w9, w11, w12, hi +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: umov w11, v0.h[3] +; CHECK-GI-NEXT: and w12, w8, #0xff +; CHECK-GI-NEXT: cmp w12, w9, uxtb +; CHECK-GI-NEXT: and w12, w10, #0xff +; CHECK-GI-NEXT: csel w8, w8, w9, hi +; CHECK-GI-NEXT: cmp w12, w11, uxtb +; CHECK-GI-NEXT: csel w9, w10, w11, hi ; CHECK-GI-NEXT: and w10, w8, #0xff ; CHECK-GI-NEXT: cmp w10, w9, uxtb ; CHECK-GI-NEXT: csel w0, w8, w9, hi @@ -1719,19 +1698,16 @@ define i16 @umaxv_v3i16(<3 x i16> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: uxth w8, w8 -; CHECK-GI-NEXT: fmov w10, s1 -; CHECK-GI-NEXT: fmov w11, s2 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[0] +; CHECK-GI-NEXT: umov w10, v0.h[1] +; CHECK-GI-NEXT: umov w11, v0.h[2] +; CHECK-GI-NEXT: umov w13, v0.h[2] ; CHECK-GI-NEXT: fmov w12, s1 -; CHECK-GI-NEXT: cmp w8, w10, uxth -; CHECK-GI-NEXT: uxth w8, w11 -; CHECK-GI-NEXT: fmov w10, s2 -; CHECK-GI-NEXT: csel w9, w9, w12, hi -; CHECK-GI-NEXT: cmp w8, w9, uxth -; CHECK-GI-NEXT: csel w0, w9, w10, lo +; CHECK-GI-NEXT: cmp w8, w12, uxth +; CHECK-GI-NEXT: csel w8, w9, w10, hi +; 
CHECK-GI-NEXT: cmp w11, w8, uxth +; CHECK-GI-NEXT: csel w0, w8, w13, lo ; CHECK-GI-NEXT: ret entry: %arg1 = call i16 @llvm.vector.reduce.umax.v3i16(<3 x i16> %a) diff --git a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll index 90f09379e68fd..7b7ca9d8ffc2d 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll @@ -73,13 +73,10 @@ define i8 @mulv_v4i8(<4 x i8> %a) { ; CHECK-GI-LABEL: mulv_v4i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w10, s2 -; CHECK-GI-NEXT: fmov w11, s3 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: umov w11, v0.h[3] ; CHECK-GI-NEXT: mul w8, w8, w9 ; CHECK-GI-NEXT: mul w9, w10, w11 ; CHECK-GI-NEXT: mul w0, w8, w9 @@ -113,27 +110,20 @@ define i8 @mulv_v8i8(<8 x i8> %a) { ; CHECK-GI-LABEL: mulv_v8i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov b4, v0.b[4] -; CHECK-GI-NEXT: mov b5, v0.b[5] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov b6, v0.b[6] -; CHECK-GI-NEXT: mov b7, v0.b[7] -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w10, s2 -; CHECK-GI-NEXT: fmov w11, s3 -; CHECK-GI-NEXT: fmov w12, s5 +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: umov w10, v0.b[2] +; CHECK-GI-NEXT: umov w11, v0.b[3] +; CHECK-GI-NEXT: umov w12, v0.b[4] +; CHECK-GI-NEXT: umov w13, v0.b[5] +; CHECK-GI-NEXT: umov w14, v0.b[6] +; CHECK-GI-NEXT: umov w15, v0.b[7] ; CHECK-GI-NEXT: mul w8, w8, w9 -; CHECK-GI-NEXT: fmov w9, s4 -; CHECK-GI-NEXT: mul w10, w10, w11 -; CHECK-GI-NEXT: fmov w11, s6 -; CHECK-GI-NEXT: mul w9, w9, w12 -; CHECK-GI-NEXT: fmov w12, s7 -; CHECK-GI-NEXT: mul w8, w8, w10 -; CHECK-GI-NEXT: mul w11, w11, w12 -; CHECK-GI-NEXT: mul w9, w9, w11 +; CHECK-GI-NEXT: mul w9, w10, w11 +; CHECK-GI-NEXT: mul w10, w12, w13 +; CHECK-GI-NEXT: mul w11, w14, w15 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: mul w9, w10, w11 ; CHECK-GI-NEXT: mul w0, w8, w9 ; CHECK-GI-NEXT: ret entry: @@ -167,27 +157,20 @@ define i8 @mulv_v16i8(<16 x i8> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: mul v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov b4, v0.b[4] -; CHECK-GI-NEXT: mov b5, v0.b[5] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov b6, v0.b[6] -; CHECK-GI-NEXT: mov b7, v0.b[7] -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w10, s2 -; CHECK-GI-NEXT: fmov w11, s3 -; CHECK-GI-NEXT: fmov w12, s5 +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: umov w10, v0.b[2] +; CHECK-GI-NEXT: umov w11, v0.b[3] +; CHECK-GI-NEXT: umov w12, v0.b[4] +; CHECK-GI-NEXT: umov w13, v0.b[5] +; CHECK-GI-NEXT: umov w14, v0.b[6] +; CHECK-GI-NEXT: umov w15, v0.b[7] +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: mul w9, w10, w11 +; CHECK-GI-NEXT: mul w10, w12, w13 +; CHECK-GI-NEXT: mul w11, w14, w15 ; CHECK-GI-NEXT: mul w8, w8, w9 -; CHECK-GI-NEXT: fmov w9, s4 -; CHECK-GI-NEXT: mul w10, w10, w11 -; CHECK-GI-NEXT: fmov w11, s6 -; CHECK-GI-NEXT: mul w9, 
w9, w12 -; CHECK-GI-NEXT: fmov w12, s7 -; CHECK-GI-NEXT: mul w8, w8, w10 -; CHECK-GI-NEXT: mul w11, w11, w12 -; CHECK-GI-NEXT: mul w9, w9, w11 +; CHECK-GI-NEXT: mul w9, w10, w11 ; CHECK-GI-NEXT: mul w0, w8, w9 ; CHECK-GI-NEXT: ret entry: @@ -225,27 +208,20 @@ define i8 @mulv_v32i8(<32 x i8> %a) { ; CHECK-GI-NEXT: mul v0.8b, v0.8b, v2.8b ; CHECK-GI-NEXT: mul v1.8b, v1.8b, v3.8b ; CHECK-GI-NEXT: mul v0.8b, v0.8b, v1.8b -; CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov b4, v0.b[4] -; CHECK-GI-NEXT: mov b5, v0.b[5] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov b6, v0.b[6] -; CHECK-GI-NEXT: mov b7, v0.b[7] -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w10, s2 -; CHECK-GI-NEXT: fmov w11, s3 -; CHECK-GI-NEXT: fmov w12, s5 +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: umov w10, v0.b[2] +; CHECK-GI-NEXT: umov w11, v0.b[3] +; CHECK-GI-NEXT: umov w12, v0.b[4] +; CHECK-GI-NEXT: umov w13, v0.b[5] +; CHECK-GI-NEXT: umov w14, v0.b[6] +; CHECK-GI-NEXT: umov w15, v0.b[7] ; CHECK-GI-NEXT: mul w8, w8, w9 -; CHECK-GI-NEXT: fmov w9, s4 -; CHECK-GI-NEXT: mul w10, w10, w11 -; CHECK-GI-NEXT: fmov w11, s6 -; CHECK-GI-NEXT: mul w9, w9, w12 -; CHECK-GI-NEXT: fmov w12, s7 -; CHECK-GI-NEXT: mul w8, w8, w10 -; CHECK-GI-NEXT: mul w11, w11, w12 -; CHECK-GI-NEXT: mul w9, w9, w11 +; CHECK-GI-NEXT: mul w9, w10, w11 +; CHECK-GI-NEXT: mul w10, w12, w13 +; CHECK-GI-NEXT: mul w11, w14, w15 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: mul w9, w10, w11 ; CHECK-GI-NEXT: mul w0, w8, w9 ; CHECK-GI-NEXT: ret entry: @@ -289,13 +265,11 @@ define i16 @mulv_v3i16(<3 x i16> %a) { ; CHECK-GI-LABEL: mulv_v3i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: umov w10, v0.h[2] ; CHECK-GI-NEXT: mul w8, w8, w9 -; CHECK-GI-NEXT: fmov w9, s2 -; CHECK-GI-NEXT: mul w0, w8, w9 +; CHECK-GI-NEXT: mul w0, w8, w10 ; CHECK-GI-NEXT: ret entry: %arg1 = call i16 @llvm.vector.reduce.mul.v3i16(<3 x i16> %a) @@ -318,13 +292,10 @@ define i16 @mulv_v4i16(<4 x i16> %a) { ; CHECK-GI-LABEL: mulv_v4i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w10, s2 -; CHECK-GI-NEXT: fmov w11, s3 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: umov w11, v0.h[3] ; CHECK-GI-NEXT: mul w8, w8, w9 ; CHECK-GI-NEXT: mul w9, w10, w11 ; CHECK-GI-NEXT: mul w0, w8, w9 @@ -352,13 +323,10 @@ define i16 @mulv_v8i16(<8 x i16> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w10, s2 -; CHECK-GI-NEXT: fmov w11, s3 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: umov w11, v0.h[3] ; CHECK-GI-NEXT: mul w8, w8, w9 ; CHECK-GI-NEXT: mul w9, w10, w11 ; CHECK-GI-NEXT: mul w0, w8, w9 @@ -390,15 +358,12 @@ 
define i16 @mulv_v16i16(<16 x i16> %a) { ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h ; CHECK-GI-NEXT: mul v1.4h, v1.4h, v3.4h ; CHECK-GI-NEXT: mul v0.4h, v0.4h, v1.4h -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w10, s2 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: umov w11, v0.h[3] ; CHECK-GI-NEXT: mul w8, w8, w9 -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: mul w9, w10, w9 +; CHECK-GI-NEXT: mul w9, w10, w11 ; CHECK-GI-NEXT: mul w0, w8, w9 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 99aa28d859e1f..dbc5417e23133 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -3,6 +3,19 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; CHECK-GI: warning: Instruction selection used fallback path for smull_zext_v4i16_v4i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmlsl2_v8i16_uzp1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl2_v8i16_uzp1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl2_v8i16_uzp1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl2_v4i32_uzp1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl2_v4i32_uzp1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for pmlsl_pmlsl2_v8i16_uzp1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl_smlsl2_v8i16_uzp1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl_umlsl2_v8i16_uzp1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for smlsl_smlsl2_v4i32_uzp1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for umlsl_umlsl2_v4i32_uzp1 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for do_stuff + define <8 x i16> @smull_v8i8_v8i16(ptr %A, ptr %B) nounwind { ; CHECK-LABEL: smull_v8i8_v8i16: ; CHECK: // %bb.0: @@ -226,11 +239,10 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-GI-NEXT: movi d0, #0x00ffff0000ffff ; CHECK-GI-NEXT: mov v1.s[1], v2.s[0] ; CHECK-GI-NEXT: and v0.8b, v1.8b, v0.8b -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov w8, v0.s[0] +; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: ldr d0, [x1] ; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: fmov d1, x8 ; CHECK-GI-NEXT: mov d3, v0.d[1] ; CHECK-GI-NEXT: mov v1.d[1], x9 diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll index f30dad966492c..23ba85d54c7a4 100644 --- a/llvm/test/CodeGen/AArch64/fptoi.ll +++ b/llvm/test/CodeGen/AArch64/fptoi.ll @@ -5846,11 +5846,9 @@ define <3 x i8> @fptos_v3f16_v3i8(<3 x half> %a) { ; CHECK-GI-FP16-LABEL: fptos_v3f16_v3i8: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: fcvtzs v0.4h, v0.4h -; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] -; CHECK-GI-FP16-NEXT: fmov w0, s0 -; CHECK-GI-FP16-NEXT: fmov w1, s1 -; CHECK-GI-FP16-NEXT: fmov w2, s2 +; 
CHECK-GI-FP16-NEXT: umov w0, v0.h[0] +; CHECK-GI-FP16-NEXT: umov w1, v0.h[1] +; CHECK-GI-FP16-NEXT: umov w2, v0.h[2] ; CHECK-GI-FP16-NEXT: ret entry: %c = fptosi <3 x half> %a to <3 x i8> @@ -5890,11 +5888,9 @@ define <3 x i8> @fptou_v3f16_v3i8(<3 x half> %a) { ; CHECK-GI-FP16-LABEL: fptou_v3f16_v3i8: ; CHECK-GI-FP16: // %bb.0: // %entry ; CHECK-GI-FP16-NEXT: fcvtzu v0.4h, v0.4h -; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] -; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] -; CHECK-GI-FP16-NEXT: fmov w0, s0 -; CHECK-GI-FP16-NEXT: fmov w1, s1 -; CHECK-GI-FP16-NEXT: fmov w2, s2 +; CHECK-GI-FP16-NEXT: umov w0, v0.h[0] +; CHECK-GI-FP16-NEXT: umov w1, v0.h[1] +; CHECK-GI-FP16-NEXT: umov w2, v0.h[2] ; CHECK-GI-FP16-NEXT: ret entry: %c = fptoui <3 x half> %a to <3 x i8> diff --git a/llvm/test/CodeGen/AArch64/reduce-and.ll b/llvm/test/CodeGen/AArch64/reduce-and.ll index a20a76c00418d..8b7438a42b711 100644 --- a/llvm/test/CodeGen/AArch64/reduce-and.ll +++ b/llvm/test/CodeGen/AArch64/reduce-and.ll @@ -53,13 +53,10 @@ define i1 @test_redand_v4i1(<4 x i1> %a) { ; GISEL-LABEL: test_redand_v4i1: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: and w8, w8, w9 ; GISEL-NEXT: and w9, w10, w11 ; GISEL-NEXT: and w8, w8, w9 @@ -82,27 +79,20 @@ define i1 @test_redand_v8i1(<8 x i1> %a) { ; GISEL-LABEL: test_redand_v8i1: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] ; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: and w10, w10, w11 -; GISEL-NEXT: and w11, w12, w13 -; GISEL-NEXT: and w8, w8, w10 -; GISEL-NEXT: and w9, w14, w9 -; GISEL-NEXT: and w9, w11, w9 +; GISEL-NEXT: and w9, w10, w11 +; GISEL-NEXT: and w10, w12, w13 +; GISEL-NEXT: and w11, w14, w15 +; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: and w9, w10, w11 ; GISEL-NEXT: and w8, w8, w9 ; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret @@ -122,49 +112,34 @@ define i1 @test_redand_v16i1(<16 x i1> %a) { ; ; GISEL-LABEL: test_redand_v16i1: ; GISEL: // %bb.0: -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: mov b16, v0.b[8] -; GISEL-NEXT: mov b17, v0.b[9] -; GISEL-NEXT: mov b18, v0.b[10] -; GISEL-NEXT: mov b19, v0.b[11] -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s6 -; GISEL-NEXT: mov b20, v0.b[12] -; GISEL-NEXT: mov b21, v0.b[13] -; GISEL-NEXT: 
fmov w13, s7 -; GISEL-NEXT: mov b22, v0.b[14] -; GISEL-NEXT: mov b23, v0.b[15] +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] +; GISEL-NEXT: umov w16, v0.b[8] +; GISEL-NEXT: umov w17, v0.b[9] +; GISEL-NEXT: umov w18, v0.b[10] +; GISEL-NEXT: umov w0, v0.b[11] ; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: umov w1, v0.b[12] +; GISEL-NEXT: umov w2, v0.b[13] ; GISEL-NEXT: and w9, w10, w11 -; GISEL-NEXT: fmov w10, s4 +; GISEL-NEXT: and w10, w12, w13 +; GISEL-NEXT: umov w3, v0.b[14] +; GISEL-NEXT: and w11, w14, w15 ; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: fmov w11, s5 -; GISEL-NEXT: fmov w14, s18 -; GISEL-NEXT: fmov w15, s19 -; GISEL-NEXT: fmov w16, s22 -; GISEL-NEXT: fmov w17, s23 -; GISEL-NEXT: and w10, w10, w11 -; GISEL-NEXT: and w11, w12, w13 -; GISEL-NEXT: fmov w12, s16 +; GISEL-NEXT: umov w4, v0.b[15] +; GISEL-NEXT: and w12, w16, w17 +; GISEL-NEXT: and w13, w18, w0 ; GISEL-NEXT: and w9, w10, w11 -; GISEL-NEXT: fmov w13, s17 -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: and w12, w12, w13 -; GISEL-NEXT: and w13, w14, w15 -; GISEL-NEXT: fmov w14, s20 -; GISEL-NEXT: fmov w15, s21 +; GISEL-NEXT: and w14, w1, w2 ; GISEL-NEXT: and w10, w12, w13 -; GISEL-NEXT: and w14, w14, w15 -; GISEL-NEXT: and w15, w16, w17 +; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: and w15, w3, w4 ; GISEL-NEXT: and w11, w14, w15 ; GISEL-NEXT: and w9, w10, w11 ; GISEL-NEXT: and w8, w8, w9 @@ -184,49 +159,34 @@ define <16 x i1> @test_redand_ins_v16i1(<16 x i1> %a) { ; ; GISEL-LABEL: test_redand_ins_v16i1: ; GISEL: // %bb.0: -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: mov b16, v0.b[8] -; GISEL-NEXT: mov b17, v0.b[9] -; GISEL-NEXT: mov b18, v0.b[10] -; GISEL-NEXT: mov b19, v0.b[11] -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s6 -; GISEL-NEXT: mov b20, v0.b[12] -; GISEL-NEXT: mov b21, v0.b[13] -; GISEL-NEXT: fmov w13, s7 -; GISEL-NEXT: mov b22, v0.b[14] -; GISEL-NEXT: mov b23, v0.b[15] +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] +; GISEL-NEXT: umov w16, v0.b[8] +; GISEL-NEXT: umov w17, v0.b[9] +; GISEL-NEXT: umov w18, v0.b[10] +; GISEL-NEXT: umov w0, v0.b[11] ; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: umov w1, v0.b[12] +; GISEL-NEXT: umov w2, v0.b[13] ; GISEL-NEXT: and w9, w10, w11 -; GISEL-NEXT: fmov w10, s4 +; GISEL-NEXT: and w10, w12, w13 +; GISEL-NEXT: umov w3, v0.b[14] +; GISEL-NEXT: and w11, w14, w15 ; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: fmov w11, s5 -; GISEL-NEXT: fmov w14, s18 -; GISEL-NEXT: fmov w15, s19 -; GISEL-NEXT: fmov w16, s22 -; GISEL-NEXT: fmov w17, s23 -; GISEL-NEXT: and w10, w10, w11 -; GISEL-NEXT: and w11, w12, w13 -; GISEL-NEXT: fmov w12, s16 +; GISEL-NEXT: umov w4, v0.b[15] +; GISEL-NEXT: and w12, w16, w17 +; GISEL-NEXT: and w13, w18, w0 ; GISEL-NEXT: and w9, w10, w11 -; GISEL-NEXT: fmov w13, s17 -; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: and w12, w12, w13 -; GISEL-NEXT: and w13, w14, w15 -; 
GISEL-NEXT: fmov w14, s20 -; GISEL-NEXT: fmov w15, s21 +; GISEL-NEXT: and w14, w1, w2 ; GISEL-NEXT: and w10, w12, w13 -; GISEL-NEXT: and w14, w14, w15 -; GISEL-NEXT: and w15, w16, w17 +; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: and w15, w3, w4 ; GISEL-NEXT: and w11, w14, w15 ; GISEL-NEXT: and w9, w10, w11 ; GISEL-NEXT: and w8, w8, w9 @@ -287,13 +247,10 @@ define i8 @test_redand_v4i8(<4 x i8> %a) { ; GISEL-LABEL: test_redand_v4i8: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: and w8, w8, w9 ; GISEL-NEXT: and w9, w10, w11 ; GISEL-NEXT: and w0, w8, w9 @@ -315,27 +272,20 @@ define i8 @test_redand_v8i8(<8 x i8> %a) { ; GISEL-LABEL: test_redand_v8i8: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] ; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: and w10, w10, w11 -; GISEL-NEXT: and w11, w12, w13 -; GISEL-NEXT: and w8, w8, w10 -; GISEL-NEXT: and w9, w14, w9 -; GISEL-NEXT: and w9, w11, w9 +; GISEL-NEXT: and w9, w10, w11 +; GISEL-NEXT: and w10, w12, w13 +; GISEL-NEXT: and w11, w14, w15 +; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: and w9, w10, w11 ; GISEL-NEXT: and w0, w8, w9 ; GISEL-NEXT: ret %and_result = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a) @@ -358,27 +308,20 @@ define i8 @test_redand_v16i8(<16 x i8> %a) { ; GISEL: // %bb.0: ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: and v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] ; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: and w10, w10, w11 -; GISEL-NEXT: and w11, w12, w13 -; GISEL-NEXT: and w8, w8, w10 -; GISEL-NEXT: and w9, w14, w9 -; GISEL-NEXT: and w9, w11, w9 +; GISEL-NEXT: and w9, w10, w11 +; GISEL-NEXT: and w10, w12, w13 +; GISEL-NEXT: and w11, w14, w15 +; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: and w9, w10, w11 ; GISEL-NEXT: and w0, w8, w9 ; GISEL-NEXT: ret %and_result = call i8 
@llvm.vector.reduce.and.v16i8(<16 x i8> %a) @@ -403,27 +346,20 @@ define i8 @test_redand_v32i8(<32 x i8> %a) { ; GISEL-NEXT: and v0.16b, v0.16b, v1.16b ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: and v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] +; GISEL-NEXT: and w8, w8, w9 +; GISEL-NEXT: and w9, w10, w11 +; GISEL-NEXT: and w10, w12, w13 +; GISEL-NEXT: and w11, w14, w15 ; GISEL-NEXT: and w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: and w10, w10, w11 -; GISEL-NEXT: and w11, w12, w13 -; GISEL-NEXT: and w8, w8, w10 -; GISEL-NEXT: and w9, w14, w9 -; GISEL-NEXT: and w9, w11, w9 +; GISEL-NEXT: and w9, w10, w11 ; GISEL-NEXT: and w0, w8, w9 ; GISEL-NEXT: ret %and_result = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %a) @@ -442,13 +378,10 @@ define i16 @test_redand_v4i16(<4 x i16> %a) { ; GISEL-LABEL: test_redand_v4i16: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: and w8, w8, w9 ; GISEL-NEXT: and w9, w10, w11 ; GISEL-NEXT: and w0, w8, w9 @@ -472,13 +405,10 @@ define i16 @test_redand_v8i16(<8 x i16> %a) { ; GISEL: // %bb.0: ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: and v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: and w8, w8, w9 ; GISEL-NEXT: and w9, w10, w11 ; GISEL-NEXT: and w0, w8, w9 @@ -504,13 +434,10 @@ define i16 @test_redand_v16i16(<16 x i16> %a) { ; GISEL-NEXT: and v0.16b, v0.16b, v1.16b ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: and v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: and w8, w8, w9 ; GISEL-NEXT: and w9, w10, w11 ; GISEL-NEXT: and w0, w8, w9 diff --git a/llvm/test/CodeGen/AArch64/reduce-or.ll b/llvm/test/CodeGen/AArch64/reduce-or.ll index 4c30a32934964..c4ac01f32e365 100644 --- a/llvm/test/CodeGen/AArch64/reduce-or.ll +++ b/llvm/test/CodeGen/AArch64/reduce-or.ll @@ -53,13 +53,10 @@ define i1 @test_redor_v4i1(<4 x i1> %a) { ; GISEL-LABEL: test_redor_v4i1: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: 
mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: orr w8, w8, w9 ; GISEL-NEXT: orr w9, w10, w11 ; GISEL-NEXT: orr w8, w8, w9 @@ -82,27 +79,20 @@ define i1 @test_redor_v8i1(<8 x i1> %a) { ; GISEL-LABEL: test_redor_v8i1: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] ; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: orr w11, w12, w13 -; GISEL-NEXT: orr w8, w8, w10 -; GISEL-NEXT: orr w9, w14, w9 -; GISEL-NEXT: orr w9, w11, w9 +; GISEL-NEXT: orr w9, w10, w11 +; GISEL-NEXT: orr w10, w12, w13 +; GISEL-NEXT: orr w11, w14, w15 +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: orr w9, w10, w11 ; GISEL-NEXT: orr w8, w8, w9 ; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret @@ -122,49 +112,34 @@ define i1 @test_redor_v16i1(<16 x i1> %a) { ; ; GISEL-LABEL: test_redor_v16i1: ; GISEL: // %bb.0: -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: mov b16, v0.b[8] -; GISEL-NEXT: mov b17, v0.b[9] -; GISEL-NEXT: mov b18, v0.b[10] -; GISEL-NEXT: mov b19, v0.b[11] -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s6 -; GISEL-NEXT: mov b20, v0.b[12] -; GISEL-NEXT: mov b21, v0.b[13] -; GISEL-NEXT: fmov w13, s7 -; GISEL-NEXT: mov b22, v0.b[14] -; GISEL-NEXT: mov b23, v0.b[15] +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] +; GISEL-NEXT: umov w16, v0.b[8] +; GISEL-NEXT: umov w17, v0.b[9] +; GISEL-NEXT: umov w18, v0.b[10] +; GISEL-NEXT: umov w0, v0.b[11] ; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: umov w1, v0.b[12] +; GISEL-NEXT: umov w2, v0.b[13] ; GISEL-NEXT: orr w9, w10, w11 -; GISEL-NEXT: fmov w10, s4 +; GISEL-NEXT: orr w10, w12, w13 +; GISEL-NEXT: umov w3, v0.b[14] +; GISEL-NEXT: orr w11, w14, w15 ; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fmov w11, s5 -; GISEL-NEXT: fmov w14, s18 -; GISEL-NEXT: fmov w15, s19 -; GISEL-NEXT: fmov w16, s22 -; GISEL-NEXT: fmov w17, s23 -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: orr w11, w12, w13 -; GISEL-NEXT: fmov w12, s16 +; GISEL-NEXT: umov w4, v0.b[15] +; GISEL-NEXT: orr w12, w16, w17 +; GISEL-NEXT: orr w13, w18, w0 ; GISEL-NEXT: orr w9, w10, w11 -; GISEL-NEXT: fmov w13, s17 -; GISEL-NEXT: orr 
w8, w8, w9 -; GISEL-NEXT: orr w12, w12, w13 -; GISEL-NEXT: orr w13, w14, w15 -; GISEL-NEXT: fmov w14, s20 -; GISEL-NEXT: fmov w15, s21 +; GISEL-NEXT: orr w14, w1, w2 ; GISEL-NEXT: orr w10, w12, w13 -; GISEL-NEXT: orr w14, w14, w15 -; GISEL-NEXT: orr w15, w16, w17 +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: orr w15, w3, w4 ; GISEL-NEXT: orr w11, w14, w15 ; GISEL-NEXT: orr w9, w10, w11 ; GISEL-NEXT: orr w8, w8, w9 @@ -184,49 +159,34 @@ define <16 x i1> @test_redor_ins_v16i1(<16 x i1> %a) { ; ; GISEL-LABEL: test_redor_ins_v16i1: ; GISEL: // %bb.0: -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: mov b16, v0.b[8] -; GISEL-NEXT: mov b17, v0.b[9] -; GISEL-NEXT: mov b18, v0.b[10] -; GISEL-NEXT: mov b19, v0.b[11] -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s6 -; GISEL-NEXT: mov b20, v0.b[12] -; GISEL-NEXT: mov b21, v0.b[13] -; GISEL-NEXT: fmov w13, s7 -; GISEL-NEXT: mov b22, v0.b[14] -; GISEL-NEXT: mov b23, v0.b[15] +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] +; GISEL-NEXT: umov w16, v0.b[8] +; GISEL-NEXT: umov w17, v0.b[9] +; GISEL-NEXT: umov w18, v0.b[10] +; GISEL-NEXT: umov w0, v0.b[11] ; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: umov w1, v0.b[12] +; GISEL-NEXT: umov w2, v0.b[13] ; GISEL-NEXT: orr w9, w10, w11 -; GISEL-NEXT: fmov w10, s4 +; GISEL-NEXT: orr w10, w12, w13 +; GISEL-NEXT: umov w3, v0.b[14] +; GISEL-NEXT: orr w11, w14, w15 ; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fmov w11, s5 -; GISEL-NEXT: fmov w14, s18 -; GISEL-NEXT: fmov w15, s19 -; GISEL-NEXT: fmov w16, s22 -; GISEL-NEXT: fmov w17, s23 -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: orr w11, w12, w13 -; GISEL-NEXT: fmov w12, s16 +; GISEL-NEXT: umov w4, v0.b[15] +; GISEL-NEXT: orr w12, w16, w17 +; GISEL-NEXT: orr w13, w18, w0 ; GISEL-NEXT: orr w9, w10, w11 -; GISEL-NEXT: fmov w13, s17 -; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: orr w12, w12, w13 -; GISEL-NEXT: orr w13, w14, w15 -; GISEL-NEXT: fmov w14, s20 -; GISEL-NEXT: fmov w15, s21 +; GISEL-NEXT: orr w14, w1, w2 ; GISEL-NEXT: orr w10, w12, w13 -; GISEL-NEXT: orr w14, w14, w15 -; GISEL-NEXT: orr w15, w16, w17 +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: orr w15, w3, w4 ; GISEL-NEXT: orr w11, w14, w15 ; GISEL-NEXT: orr w9, w10, w11 ; GISEL-NEXT: orr w8, w8, w9 @@ -289,13 +249,10 @@ define i8 @test_redor_v4i8(<4 x i8> %a) { ; GISEL-LABEL: test_redor_v4i8: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: orr w8, w8, w9 ; GISEL-NEXT: orr w9, w10, w11 ; GISEL-NEXT: orr w0, w8, w9 @@ -317,27 +274,20 @@ define i8 @test_redor_v8i8(<8 x i8> %a) { ; GISEL-LABEL: test_redor_v8i8: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; 
GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] ; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: orr w11, w12, w13 -; GISEL-NEXT: orr w8, w8, w10 -; GISEL-NEXT: orr w9, w14, w9 -; GISEL-NEXT: orr w9, w11, w9 +; GISEL-NEXT: orr w9, w10, w11 +; GISEL-NEXT: orr w10, w12, w13 +; GISEL-NEXT: orr w11, w14, w15 +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: orr w9, w10, w11 ; GISEL-NEXT: orr w0, w8, w9 ; GISEL-NEXT: ret %or_result = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a) @@ -360,27 +310,20 @@ define i8 @test_redor_v16i8(<16 x i8> %a) { ; GISEL: // %bb.0: ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] ; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: orr w11, w12, w13 -; GISEL-NEXT: orr w8, w8, w10 -; GISEL-NEXT: orr w9, w14, w9 -; GISEL-NEXT: orr w9, w11, w9 +; GISEL-NEXT: orr w9, w10, w11 +; GISEL-NEXT: orr w10, w12, w13 +; GISEL-NEXT: orr w11, w14, w15 +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: orr w9, w10, w11 ; GISEL-NEXT: orr w0, w8, w9 ; GISEL-NEXT: ret %or_result = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a) @@ -405,27 +348,20 @@ define i8 @test_redor_v32i8(<32 x i8> %a) { ; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] +; GISEL-NEXT: orr w8, w8, w9 +; GISEL-NEXT: orr w9, w10, w11 +; GISEL-NEXT: orr w10, w12, w13 +; GISEL-NEXT: orr w11, w14, w15 ; GISEL-NEXT: orr w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: orr w10, w10, w11 -; GISEL-NEXT: orr w11, w12, w13 -; GISEL-NEXT: orr w8, w8, w10 -; GISEL-NEXT: orr w9, w14, w9 -; GISEL-NEXT: orr 
w9, w11, w9 +; GISEL-NEXT: orr w9, w10, w11 ; GISEL-NEXT: orr w0, w8, w9 ; GISEL-NEXT: ret %or_result = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %a) @@ -444,13 +380,10 @@ define i16 @test_redor_v4i16(<4 x i16> %a) { ; GISEL-LABEL: test_redor_v4i16: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: orr w8, w8, w9 ; GISEL-NEXT: orr w9, w10, w11 ; GISEL-NEXT: orr w0, w8, w9 @@ -474,13 +407,10 @@ define i16 @test_redor_v8i16(<8 x i16> %a) { ; GISEL: // %bb.0: ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: orr w8, w8, w9 ; GISEL-NEXT: orr w9, w10, w11 ; GISEL-NEXT: orr w0, w8, w9 @@ -506,13 +436,10 @@ define i16 @test_redor_v16i16(<16 x i16> %a) { ; GISEL-NEXT: orr v0.16b, v0.16b, v1.16b ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: orr v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: orr w8, w8, w9 ; GISEL-NEXT: orr w9, w10, w11 ; GISEL-NEXT: orr w0, w8, w9 diff --git a/llvm/test/CodeGen/AArch64/reduce-xor.ll b/llvm/test/CodeGen/AArch64/reduce-xor.ll index c74b3734a1b76..5c2a808ef2e88 100644 --- a/llvm/test/CodeGen/AArch64/reduce-xor.ll +++ b/llvm/test/CodeGen/AArch64/reduce-xor.ll @@ -48,13 +48,10 @@ define i1 @test_redxor_v4i1(<4 x i1> %a) { ; GISEL-LABEL: test_redxor_v4i1: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: eor w8, w8, w9 ; GISEL-NEXT: eor w9, w10, w11 ; GISEL-NEXT: eor w8, w8, w9 @@ -75,27 +72,20 @@ define i1 @test_redxor_v8i1(<8 x i1> %a) { ; GISEL-LABEL: test_redxor_v8i1: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; 
GISEL-NEXT: umov w15, v0.b[7] ; GISEL-NEXT: eor w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: eor w10, w10, w11 -; GISEL-NEXT: eor w11, w12, w13 -; GISEL-NEXT: eor w8, w8, w10 -; GISEL-NEXT: eor w9, w14, w9 -; GISEL-NEXT: eor w9, w11, w9 +; GISEL-NEXT: eor w9, w10, w11 +; GISEL-NEXT: eor w10, w12, w13 +; GISEL-NEXT: eor w11, w14, w15 +; GISEL-NEXT: eor w8, w8, w9 +; GISEL-NEXT: eor w9, w10, w11 ; GISEL-NEXT: eor w8, w8, w9 ; GISEL-NEXT: and w0, w8, #0x1 ; GISEL-NEXT: ret @@ -113,49 +103,34 @@ define i1 @test_redxor_v16i1(<16 x i1> %a) { ; ; GISEL-LABEL: test_redxor_v16i1: ; GISEL: // %bb.0: -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: mov b16, v0.b[8] -; GISEL-NEXT: mov b17, v0.b[9] -; GISEL-NEXT: mov b18, v0.b[10] -; GISEL-NEXT: mov b19, v0.b[11] -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s6 -; GISEL-NEXT: mov b20, v0.b[12] -; GISEL-NEXT: mov b21, v0.b[13] -; GISEL-NEXT: fmov w13, s7 -; GISEL-NEXT: mov b22, v0.b[14] -; GISEL-NEXT: mov b23, v0.b[15] +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] +; GISEL-NEXT: umov w16, v0.b[8] +; GISEL-NEXT: umov w17, v0.b[9] +; GISEL-NEXT: umov w18, v0.b[10] +; GISEL-NEXT: umov w0, v0.b[11] ; GISEL-NEXT: eor w8, w8, w9 +; GISEL-NEXT: umov w1, v0.b[12] +; GISEL-NEXT: umov w2, v0.b[13] ; GISEL-NEXT: eor w9, w10, w11 -; GISEL-NEXT: fmov w10, s4 +; GISEL-NEXT: eor w10, w12, w13 +; GISEL-NEXT: umov w3, v0.b[14] +; GISEL-NEXT: eor w11, w14, w15 ; GISEL-NEXT: eor w8, w8, w9 -; GISEL-NEXT: fmov w11, s5 -; GISEL-NEXT: fmov w14, s18 -; GISEL-NEXT: fmov w15, s19 -; GISEL-NEXT: fmov w16, s22 -; GISEL-NEXT: fmov w17, s23 -; GISEL-NEXT: eor w10, w10, w11 -; GISEL-NEXT: eor w11, w12, w13 -; GISEL-NEXT: fmov w12, s16 +; GISEL-NEXT: umov w4, v0.b[15] +; GISEL-NEXT: eor w12, w16, w17 +; GISEL-NEXT: eor w13, w18, w0 ; GISEL-NEXT: eor w9, w10, w11 -; GISEL-NEXT: fmov w13, s17 -; GISEL-NEXT: eor w8, w8, w9 -; GISEL-NEXT: eor w12, w12, w13 -; GISEL-NEXT: eor w13, w14, w15 -; GISEL-NEXT: fmov w14, s20 -; GISEL-NEXT: fmov w15, s21 +; GISEL-NEXT: eor w14, w1, w2 ; GISEL-NEXT: eor w10, w12, w13 -; GISEL-NEXT: eor w14, w14, w15 -; GISEL-NEXT: eor w15, w16, w17 +; GISEL-NEXT: eor w8, w8, w9 +; GISEL-NEXT: eor w15, w3, w4 ; GISEL-NEXT: eor w11, w14, w15 ; GISEL-NEXT: eor w9, w10, w11 ; GISEL-NEXT: eor w8, w8, w9 @@ -173,49 +148,34 @@ define <16 x i1> @test_redxor_ins_v16i1(<16 x i1> %a) { ; ; GISEL-LABEL: test_redxor_ins_v16i1: ; GISEL: // %bb.0: -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: mov b16, v0.b[8] -; GISEL-NEXT: mov b17, v0.b[9] -; GISEL-NEXT: mov b18, v0.b[10] -; GISEL-NEXT: mov b19, v0.b[11] -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s6 -; GISEL-NEXT: mov b20, v0.b[12] -; GISEL-NEXT: mov b21, v0.b[13] -; GISEL-NEXT: fmov w13, s7 -; GISEL-NEXT: mov b22, v0.b[14] -; GISEL-NEXT: mov b23, v0.b[15] +; 
GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] +; GISEL-NEXT: umov w16, v0.b[8] +; GISEL-NEXT: umov w17, v0.b[9] +; GISEL-NEXT: umov w18, v0.b[10] +; GISEL-NEXT: umov w0, v0.b[11] ; GISEL-NEXT: eor w8, w8, w9 +; GISEL-NEXT: umov w1, v0.b[12] +; GISEL-NEXT: umov w2, v0.b[13] ; GISEL-NEXT: eor w9, w10, w11 -; GISEL-NEXT: fmov w10, s4 +; GISEL-NEXT: eor w10, w12, w13 +; GISEL-NEXT: umov w3, v0.b[14] +; GISEL-NEXT: eor w11, w14, w15 ; GISEL-NEXT: eor w8, w8, w9 -; GISEL-NEXT: fmov w11, s5 -; GISEL-NEXT: fmov w14, s18 -; GISEL-NEXT: fmov w15, s19 -; GISEL-NEXT: fmov w16, s22 -; GISEL-NEXT: fmov w17, s23 -; GISEL-NEXT: eor w10, w10, w11 -; GISEL-NEXT: eor w11, w12, w13 -; GISEL-NEXT: fmov w12, s16 +; GISEL-NEXT: umov w4, v0.b[15] +; GISEL-NEXT: eor w12, w16, w17 +; GISEL-NEXT: eor w13, w18, w0 ; GISEL-NEXT: eor w9, w10, w11 -; GISEL-NEXT: fmov w13, s17 -; GISEL-NEXT: eor w8, w8, w9 -; GISEL-NEXT: eor w12, w12, w13 -; GISEL-NEXT: eor w13, w14, w15 -; GISEL-NEXT: fmov w14, s20 -; GISEL-NEXT: fmov w15, s21 +; GISEL-NEXT: eor w14, w1, w2 ; GISEL-NEXT: eor w10, w12, w13 -; GISEL-NEXT: eor w14, w14, w15 -; GISEL-NEXT: eor w15, w16, w17 +; GISEL-NEXT: eor w8, w8, w9 +; GISEL-NEXT: eor w15, w3, w4 ; GISEL-NEXT: eor w11, w14, w15 ; GISEL-NEXT: eor w9, w10, w11 ; GISEL-NEXT: eor w8, w8, w9 @@ -278,13 +238,10 @@ define i8 @test_redxor_v4i8(<4 x i8> %a) { ; GISEL-LABEL: test_redxor_v4i8: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: eor w8, w8, w9 ; GISEL-NEXT: eor w9, w10, w11 ; GISEL-NEXT: eor w0, w8, w9 @@ -306,27 +263,20 @@ define i8 @test_redxor_v8i8(<8 x i8> %a) { ; GISEL-LABEL: test_redxor_v8i8: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] ; GISEL-NEXT: eor w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: eor w10, w10, w11 -; GISEL-NEXT: eor w11, w12, w13 -; GISEL-NEXT: eor w8, w8, w10 -; GISEL-NEXT: eor w9, w14, w9 -; GISEL-NEXT: eor w9, w11, w9 +; GISEL-NEXT: eor w9, w10, w11 +; GISEL-NEXT: eor w10, w12, w13 +; GISEL-NEXT: eor w11, w14, w15 +; GISEL-NEXT: eor w8, w8, w9 +; GISEL-NEXT: eor w9, w10, w11 ; GISEL-NEXT: eor w0, w8, w9 ; GISEL-NEXT: ret %xor_result = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a) @@ -349,27 +299,20 @@ define i8 @test_redxor_v16i8(<16 x i8> %a) { ; GISEL: // %bb.0: ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: 
eor v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] ; GISEL-NEXT: eor w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: eor w10, w10, w11 -; GISEL-NEXT: eor w11, w12, w13 -; GISEL-NEXT: eor w8, w8, w10 -; GISEL-NEXT: eor w9, w14, w9 -; GISEL-NEXT: eor w9, w11, w9 +; GISEL-NEXT: eor w9, w10, w11 +; GISEL-NEXT: eor w10, w12, w13 +; GISEL-NEXT: eor w11, w14, w15 +; GISEL-NEXT: eor w8, w8, w9 +; GISEL-NEXT: eor w9, w10, w11 ; GISEL-NEXT: eor w0, w8, w9 ; GISEL-NEXT: ret %xor_result = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a) @@ -394,27 +337,20 @@ define i8 @test_redxor_v32i8(<32 x i8> %a) { ; GISEL-NEXT: eor v0.16b, v0.16b, v1.16b ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov b1, v0.b[1] -; GISEL-NEXT: mov b2, v0.b[2] -; GISEL-NEXT: mov b3, v0.b[3] -; GISEL-NEXT: mov b4, v0.b[4] -; GISEL-NEXT: mov b5, v0.b[5] -; GISEL-NEXT: mov b6, v0.b[6] -; GISEL-NEXT: mov b7, v0.b[7] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 -; GISEL-NEXT: fmov w12, s4 -; GISEL-NEXT: fmov w13, s5 -; GISEL-NEXT: fmov w14, s6 +; GISEL-NEXT: umov w8, v0.b[0] +; GISEL-NEXT: umov w9, v0.b[1] +; GISEL-NEXT: umov w10, v0.b[2] +; GISEL-NEXT: umov w11, v0.b[3] +; GISEL-NEXT: umov w12, v0.b[4] +; GISEL-NEXT: umov w13, v0.b[5] +; GISEL-NEXT: umov w14, v0.b[6] +; GISEL-NEXT: umov w15, v0.b[7] +; GISEL-NEXT: eor w8, w8, w9 +; GISEL-NEXT: eor w9, w10, w11 +; GISEL-NEXT: eor w10, w12, w13 +; GISEL-NEXT: eor w11, w14, w15 ; GISEL-NEXT: eor w8, w8, w9 -; GISEL-NEXT: fmov w9, s7 -; GISEL-NEXT: eor w10, w10, w11 -; GISEL-NEXT: eor w11, w12, w13 -; GISEL-NEXT: eor w8, w8, w10 -; GISEL-NEXT: eor w9, w14, w9 -; GISEL-NEXT: eor w9, w11, w9 +; GISEL-NEXT: eor w9, w10, w11 ; GISEL-NEXT: eor w0, w8, w9 ; GISEL-NEXT: ret %xor_result = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %a) @@ -433,13 +369,10 @@ define i16 @test_redxor_v4i16(<4 x i16> %a) { ; GISEL-LABEL: test_redxor_v4i16: ; GISEL: // %bb.0: ; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: eor w8, w8, w9 ; GISEL-NEXT: eor w9, w10, w11 ; GISEL-NEXT: eor w0, w8, w9 @@ -463,13 +396,10 @@ define i16 @test_redxor_v8i16(<8 x i16> %a) { ; GISEL: // %bb.0: ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: 
umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: eor w8, w8, w9 ; GISEL-NEXT: eor w9, w10, w11 ; GISEL-NEXT: eor w0, w8, w9 @@ -495,13 +425,10 @@ define i16 @test_redxor_v16i16(<16 x i16> %a) { ; GISEL-NEXT: eor v0.16b, v0.16b, v1.16b ; GISEL-NEXT: mov d1, v0.d[1] ; GISEL-NEXT: eor v0.8b, v0.8b, v1.8b -; GISEL-NEXT: mov h1, v0.h[1] -; GISEL-NEXT: mov h2, v0.h[2] -; GISEL-NEXT: mov h3, v0.h[3] -; GISEL-NEXT: fmov w8, s0 -; GISEL-NEXT: fmov w9, s1 -; GISEL-NEXT: fmov w10, s2 -; GISEL-NEXT: fmov w11, s3 +; GISEL-NEXT: umov w8, v0.h[0] +; GISEL-NEXT: umov w9, v0.h[1] +; GISEL-NEXT: umov w10, v0.h[2] +; GISEL-NEXT: umov w11, v0.h[3] ; GISEL-NEXT: eor w8, w8, w9 ; GISEL-NEXT: eor w9, w10, w11 ; GISEL-NEXT: eor w0, w8, w9 diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 4d26228caf62e..014e4071a4bf6 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -289,18 +289,14 @@ define <3 x i32> @sext_v3i16_v3i32(<3 x i16> %a) { ; CHECK-GI-LABEL: sext_v3i16_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: sxth w8, w8 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: sxth w9, w9 -; CHECK-GI-NEXT: sxth w8, w8 -; CHECK-GI-NEXT: mov v0.s[1], w9 -; CHECK-GI-NEXT: mov v0.s[2], w8 -; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: smov w8, v0.h[0] +; CHECK-GI-NEXT: smov w9, v0.h[1] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: smov w8, v0.h[2] +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[3], w8 +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i16> %a to <3 x i32> @@ -322,15 +318,10 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) { ; CHECK-GI-LABEL: sext_v3i16_v3i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: sxth x8, w8 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w10, s2 +; CHECK-GI-NEXT: smov x8, v0.h[0] +; CHECK-GI-NEXT: smov x9, v0.h[1] +; CHECK-GI-NEXT: smov x10, v0.h[2] ; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: sxth x9, w9 -; CHECK-GI-NEXT: sxth x10, w10 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 ; CHECK-GI-NEXT: ret @@ -352,15 +343,10 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) { ; ; CHECK-GI-LABEL: sext_v3i32_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: sxtw x8, w8 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w10, s2 +; CHECK-GI-NEXT: smov x8, v0.s[0] +; CHECK-GI-NEXT: smov x9, v0.s[1] +; CHECK-GI-NEXT: smov x10, v0.s[2] ; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: sxtw x9, w9 -; CHECK-GI-NEXT: sxtw x10, w10 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll index 53aefaf3d3360..7f804fe48fd85 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -168,53 +168,32 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK-GI-LABEL: test_v9i8: ; CHECK-GI: // %bb.0: ; 
CHECK-GI-NEXT: mov b1, v0.b[1] -; CHECK-GI-NEXT: mov b2, v0.b[2] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov b3, v0.b[3] -; CHECK-GI-NEXT: mov b4, v0.b[4] -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: uxtb w8, w8 -; CHECK-GI-NEXT: fmov w10, s1 -; CHECK-GI-NEXT: fmov w11, s2 -; CHECK-GI-NEXT: fmov w12, s1 -; CHECK-GI-NEXT: mov b1, v0.b[5] -; CHECK-GI-NEXT: mov b2, v0.b[6] -; CHECK-GI-NEXT: cmp w8, w10, uxtb -; CHECK-GI-NEXT: fmov w10, s3 -; CHECK-GI-NEXT: uxtb w8, w11 -; CHECK-GI-NEXT: csel w9, w9, w12, hi -; CHECK-GI-NEXT: cmp w8, w9, uxtb -; CHECK-GI-NEXT: uxtb w8, w10 -; CHECK-GI-NEXT: fmov w10, s4 -; CHECK-GI-NEXT: csel w9, w9, w11, lo -; CHECK-GI-NEXT: fmov w11, s3 -; CHECK-GI-NEXT: mov b3, v0.b[7] -; CHECK-GI-NEXT: mov b0, v0.b[8] -; CHECK-GI-NEXT: cmp w8, w9, uxtb -; CHECK-GI-NEXT: uxtb w8, w10 -; CHECK-GI-NEXT: fmov w10, s1 -; CHECK-GI-NEXT: csel w9, w9, w11, lo -; CHECK-GI-NEXT: fmov w11, s4 -; CHECK-GI-NEXT: cmp w8, w9, uxtb -; CHECK-GI-NEXT: uxtb w8, w10 -; CHECK-GI-NEXT: fmov w10, s2 -; CHECK-GI-NEXT: csel w9, w9, w11, lo +; CHECK-GI-NEXT: umov w8, v0.b[0] +; CHECK-GI-NEXT: umov w9, v0.b[1] +; CHECK-GI-NEXT: umov w10, v0.b[2] ; CHECK-GI-NEXT: fmov w11, s1 -; CHECK-GI-NEXT: cmp w8, w9, uxtb -; CHECK-GI-NEXT: uxtb w8, w10 -; CHECK-GI-NEXT: fmov w10, s3 -; CHECK-GI-NEXT: csel w9, w9, w11, lo -; CHECK-GI-NEXT: fmov w11, s2 -; CHECK-GI-NEXT: cmp w8, w9, uxtb -; CHECK-GI-NEXT: uxtb w8, w10 -; CHECK-GI-NEXT: fmov w10, s0 -; CHECK-GI-NEXT: csel w9, w9, w11, lo -; CHECK-GI-NEXT: fmov w11, s3 -; CHECK-GI-NEXT: cmp w8, w9, uxtb -; CHECK-GI-NEXT: uxtb w8, w10 -; CHECK-GI-NEXT: csel w9, w9, w11, lo -; CHECK-GI-NEXT: cmp w8, w9, uxtb -; CHECK-GI-NEXT: csel w0, w9, w10, lo +; CHECK-GI-NEXT: cmp w8, w11, uxtb +; CHECK-GI-NEXT: umov w11, v0.b[3] +; CHECK-GI-NEXT: csel w8, w8, w9, hi +; CHECK-GI-NEXT: umov w9, v0.b[4] +; CHECK-GI-NEXT: cmp w10, w8, uxtb +; CHECK-GI-NEXT: csel w8, w8, w10, lo +; CHECK-GI-NEXT: umov w10, v0.b[5] +; CHECK-GI-NEXT: cmp w11, w8, uxtb +; CHECK-GI-NEXT: csel w8, w8, w11, lo +; CHECK-GI-NEXT: umov w11, v0.b[6] +; CHECK-GI-NEXT: cmp w9, w8, uxtb +; CHECK-GI-NEXT: csel w8, w8, w9, lo +; CHECK-GI-NEXT: umov w9, v0.b[7] +; CHECK-GI-NEXT: cmp w10, w8, uxtb +; CHECK-GI-NEXT: csel w8, w8, w10, lo +; CHECK-GI-NEXT: umov w10, v0.b[8] +; CHECK-GI-NEXT: cmp w11, w8, uxtb +; CHECK-GI-NEXT: csel w8, w8, w11, lo +; CHECK-GI-NEXT: cmp w9, w8, uxtb +; CHECK-GI-NEXT: csel w8, w8, w9, lo +; CHECK-GI-NEXT: cmp w10, w8, uxtb +; CHECK-GI-NEXT: csel w0, w8, w10, lo ; CHECK-GI-NEXT: ret %b = call i8 @llvm.vector.reduce.umax.v9i8(<9 x i8> %a) ret i8 %b @@ -259,21 +238,18 @@ define i1 @test_v4i1(<4 x i1> %a) nounwind { ; CHECK-GI-LABEL: test_v4i1: ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: mov h3, v0.h[3] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w10, s1 -; CHECK-GI-NEXT: fmov w12, s2 -; CHECK-GI-NEXT: fmov w13, s3 -; CHECK-GI-NEXT: and w9, w8, #0x1 -; CHECK-GI-NEXT: and w11, w10, #0x1 -; CHECK-GI-NEXT: cmp w9, w11 -; CHECK-GI-NEXT: and w9, w12, #0x1 -; CHECK-GI-NEXT: and w11, w13, #0x1 -; CHECK-GI-NEXT: csel w8, w8, w10, hi -; CHECK-GI-NEXT: cmp w9, w11 -; CHECK-GI-NEXT: csel w9, w12, w13, hi +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: umov w11, v0.h[3] +; CHECK-GI-NEXT: and w12, w8, #0x1 +; CHECK-GI-NEXT: and w13, w9, #0x1 +; CHECK-GI-NEXT: cmp w12, w13 +; CHECK-GI-NEXT: and 
w12, w10, #0x1 +; CHECK-GI-NEXT: and w13, w11, #0x1 +; CHECK-GI-NEXT: csel w8, w8, w9, hi +; CHECK-GI-NEXT: cmp w12, w13 +; CHECK-GI-NEXT: csel w9, w10, w11, hi ; CHECK-GI-NEXT: and w10, w8, #0x1 ; CHECK-GI-NEXT: and w11, w9, #0x1 ; CHECK-GI-NEXT: cmp w10, w11 diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll index 0dd4e3644b783..21982fadbe803 100644 --- a/llvm/test/CodeGen/AArch64/xtn.ll +++ b/llvm/test/CodeGen/AArch64/xtn.ll @@ -224,23 +224,13 @@ entry: } define <3 x i8> @xtn_v3i16_v3i8(<3 x i16> %a) { -; CHECK-SD-LABEL: xtn_v3i16_v3i8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: umov w0, v0.h[0] -; CHECK-SD-NEXT: umov w1, v0.h[1] -; CHECK-SD-NEXT: umov w2, v0.h[2] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: xtn_v3i16_v3i8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w0, s0 -; CHECK-GI-NEXT: fmov w1, s1 -; CHECK-GI-NEXT: fmov w2, s2 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: xtn_v3i16_v3i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: umov w1, v0.h[1] +; CHECK-NEXT: umov w2, v0.h[2] +; CHECK-NEXT: ret entry: %arg1 = trunc <3 x i16> %a to <3 x i8> ret <3 x i8> %arg1 diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index 42c0bf79e7789..e513340f5b18a 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -2,6 +2,8 @@ ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; CHECK-GI: warning: Instruction selection used fallback path for zext_v16i10_v16i16 + define i16 @zext_i8_to_i16(i8 %a) { ; CHECK-LABEL: zext_i8_to_i16: ; CHECK: // %bb.0: // %entry @@ -333,18 +335,14 @@ define <3 x i32> @zext_v3i16_v3i32(<3 x i16> %a) { ; CHECK-GI-LABEL: zext_v3i16_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: uxth w8, w8 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: fmov w8, s2 -; CHECK-GI-NEXT: uxth w9, w9 -; CHECK-GI-NEXT: uxth w8, w8 -; CHECK-GI-NEXT: mov v0.s[1], w9 -; CHECK-GI-NEXT: mov v0.s[2], w8 -; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: umov w8, v0.h[2] +; CHECK-GI-NEXT: mov v1.s[1], w9 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[3], w8 +; CHECK-GI-NEXT: mov v0.16b, v1.16b ; CHECK-GI-NEXT: ret entry: %c = zext <3 x i16> %a to <3 x i32> @@ -366,15 +364,10 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) { ; CHECK-GI-LABEL: zext_v3i16_v3i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h1, v0.h[1] -; CHECK-GI-NEXT: mov h2, v0.h[2] -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: ubfx x8, x8, #0, #16 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w10, s2 +; CHECK-GI-NEXT: umov w8, v0.h[0] +; CHECK-GI-NEXT: umov w9, v0.h[1] +; CHECK-GI-NEXT: umov w10, v0.h[2] ; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: ubfx x9, x9, #0, #16 -; CHECK-GI-NEXT: ubfx x10, 
x10, #0, #16 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 ; CHECK-GI-NEXT: ret @@ -396,12 +389,10 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) { ; ; CHECK-GI-LABEL: zext_v3i32_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov s1, v0.s[1] -; CHECK-GI-NEXT: mov s2, v0.s[2] -; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: mov w8, v0.s[0] +; CHECK-GI-NEXT: mov w9, v0.s[1] +; CHECK-GI-NEXT: mov w10, v0.s[2] ; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w10, s2 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 ; CHECK-GI-NEXT: ret From ba4d36951f1deaf24dce526900a2d6dbdefa377b Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet <gchatelet@google.com> Date: Thu, 21 Dec 2023 10:51:23 +0100 Subject: [PATCH 040/342] [libc] Make BigInt bit_cast-able to compatible types (#75063) This is a second take on #74837 to fix #74258 --- libc/src/__support/CPP/bit.h | 8 ++-- libc/src/__support/FPUtil/FPBits.h | 1 + libc/src/__support/UInt.h | 44 ++++++++++++++++++- libc/test/src/__support/uint_test.cpp | 63 +++++++++++++++++++++++---- 4 files changed, 102 insertions(+), 14 deletions(-) diff --git a/libc/src/__support/CPP/bit.h b/libc/src/__support/CPP/bit.h index 4de142b56165b..122f6b8c33281 100644 --- a/libc/src/__support/CPP/bit.h +++ b/libc/src/__support/CPP/bit.h @@ -29,10 +29,10 @@ namespace LIBC_NAMESPACE::cpp { // UB in the implementation. template < typename To, typename From, - typename = cpp::enable_if_t<sizeof(To) == sizeof(From)>, - typename = cpp::enable_if_t<cpp::is_trivially_constructible<To>::value>, - typename = cpp::enable_if_t<cpp::is_trivially_copyable<To>::value>, - typename = cpp::enable_if_t<cpp::is_trivially_copyable<From>::value>> + typename = cpp::enable_if_t<sizeof(To) == sizeof(From) && + cpp::is_trivially_constructible<To>::value && + cpp::is_trivially_copyable<To>::value && + cpp::is_trivially_copyable<From>::value>> LIBC_INLINE constexpr To bit_cast(const From &from) { MSAN_UNPOISON(&from, sizeof(From)); #if LIBC_HAS_BUILTIN(__builtin_bit_cast) diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index 37e2820eab855..b13ce80f94f6e 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -11,6 +11,7 @@ #include "src/__support/CPP/bit.h" #include "src/__support/CPP/type_traits.h" +#include "src/__support/UInt128.h" #include "src/__support/common.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h index cfd495c586185..79e05940f0277 100644 --- a/libc/src/__support/UInt.h +++ b/libc/src/__support/UInt.h @@ -30,7 +30,7 @@ template <size_t Bits, bool Signed> struct BigInt { static_assert(Bits > 0 && Bits % 64 == 0, "Number of bits in BigInt should be a multiple of 64."); LIBC_INLINE_VAR static constexpr size_t WORDCOUNT = Bits / 64; - uint64_t val[WORDCOUNT]{}; + cpp::array<uint64_t, WORDCOUNT> val{}; LIBC_INLINE_VAR static constexpr uint64_t MASK32 = 0xFFFFFFFFu; @@ -954,6 +954,48 @@ struct make_signed<UInt<Bits>> : type_identity<Int<Bits>> { "Number of bits in Int should be a multiple of 64."); }; +namespace internal { +template <typename T> struct is_custom_uint : cpp::false_type {}; +template <size_t Bits> struct is_custom_uint<UInt<Bits>> : cpp::true_type {}; +} // namespace internal + +// bit_cast to UInt +// Note: The standard scheme for SFINAE selection is to have exactly one +// function instanciation valid at a time. 
This is usually done by having a +// predicate in one function and the negated predicate in the other one. +// e.g. +// template<typename = cpp::enable_if_t< is_custom_uint<To>::value == true> ... +// template<typename = cpp::enable_if_t< is_custom_uint<To>::value == false> ... +// +// Unfortunately this would make the default 'cpp::bit_cast' aware of +// 'is_custom_uint' (or any other customization). To prevent exposing all +// customizations in the original function, we create a different function with +// four 'typename's instead of three - otherwise it would be considered as a +// redeclaration of the same function leading to "error: template parameter +// redefines default argument". +template <typename To, typename From, + typename = cpp::enable_if_t<sizeof(To) == sizeof(From) && + cpp::is_trivially_copyable<To>::value && + cpp::is_trivially_copyable<From>::value>, + typename = cpp::enable_if_t<internal::is_custom_uint<To>::value>> +LIBC_INLINE constexpr To bit_cast(const From &from) { + To out; + using Storage = decltype(out.val); + out.val = cpp::bit_cast<Storage>(from); + return out; +} + +// bit_cast from UInt +template < + typename To, size_t Bits, + typename = cpp::enable_if_t<sizeof(To) == sizeof(UInt<Bits>) && + cpp::is_trivially_constructible<To>::value && + cpp::is_trivially_copyable<To>::value && + cpp::is_trivially_copyable<UInt<Bits>>::value>> +LIBC_INLINE constexpr To bit_cast(const UInt<Bits> &from) { + return cpp::bit_cast<To>(from.val); +} + } // namespace LIBC_NAMESPACE::cpp #endif // LLVM_LIBC_SRC___SUPPORT_UINT_H diff --git a/libc/test/src/__support/uint_test.cpp b/libc/test/src/__support/uint_test.cpp index 971bac55bd9d3..0ad72c35645c4 100644 --- a/libc/test/src/__support/uint_test.cpp +++ b/libc/test/src/__support/uint_test.cpp @@ -10,19 +10,62 @@ #include "src/__support/UInt.h" #include "test/UnitTest/Test.h" +#include <math.h> // HUGE_VALF, HUGE_VALF -// We want to test LIBC_NAMESPACE::cpp::UInt<128> explicitly. So, for +namespace LIBC_NAMESPACE { + +using LL_UInt64 = cpp::UInt<64>; +// We want to test cpp::UInt<128> explicitly. So, for // convenience, we use a sugar which does not conflict with the UInt128 type // which can resolve to __uint128_t if the platform has it. 
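
The SFINAE note in UInt.h above describes the trick this patch relies on: the customized bit_cast gets an extra defaulted 'typename' so it is a distinct overload rather than a redeclaration of the generic template, and the generic template never has to mention the customization trait. A minimal self-contained sketch of that idea, using invented names (is_custom, Wide128, storage) and plain std:: traits instead of the libc cpp:: wrappers, could look like this (C++14 or later):

#include <cstring>
#include <type_traits>

template <typename T> struct is_custom : std::false_type {};

struct Wide128 {
  // The default member initializer makes Wide128 non-trivially-constructible,
  // which is what keeps it out of the generic overload below.
  unsigned long long storage[2] = {0, 0};
};
template <> struct is_custom<Wide128> : std::true_type {};

// Generic overload: three template parameters, never mentions is_custom.
template <typename To, typename From,
          typename = std::enable_if_t<
              sizeof(To) == sizeof(From) &&
              std::is_trivially_constructible<To>::value &&
              std::is_trivially_copyable<To>::value &&
              std::is_trivially_copyable<From>::value>>
To bit_cast(const From &from) {
  To to;
  std::memcpy(&to, &from, sizeof(To));
  return to;
}

// Customized overload: the extra fourth defaulted 'typename' makes this a
// distinct template instead of a redeclaration of the generic one, so only
// this overload needs to know about is_custom.
template <typename To, typename From,
          typename = std::enable_if_t<sizeof(To) == sizeof(From) &&
                                      std::is_trivially_copyable<From>::value>,
          typename = std::enable_if_t<is_custom<To>::value>>
To bit_cast(const From &from) {
  To to; // zero-initialized via the member initializer, then overwritten
  std::memcpy(to.storage, &from, sizeof(to.storage));
  return to;
}

With this setup, bit_cast<Wide128>(x) for a trivially copyable 16-byte x resolves to the customized overload (the generic one drops out because Wide128 is not trivially constructible), while bit_cast<double>(u) keeps selecting the generic one; this presumably mirrors how the real UInt, whose cpp::array member carries a default initializer, avoids colliding with the generic cpp::bit_cast.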
-using LL_UInt128 = LIBC_NAMESPACE::cpp::UInt<128>; -using LL_UInt192 = LIBC_NAMESPACE::cpp::UInt<192>; -using LL_UInt256 = LIBC_NAMESPACE::cpp::UInt<256>; -using LL_UInt320 = LIBC_NAMESPACE::cpp::UInt<320>; -using LL_UInt512 = LIBC_NAMESPACE::cpp::UInt<512>; -using LL_UInt1024 = LIBC_NAMESPACE::cpp::UInt<1024>; +using LL_UInt128 = cpp::UInt<128>; +using LL_UInt192 = cpp::UInt<192>; +using LL_UInt256 = cpp::UInt<256>; +using LL_UInt320 = cpp::UInt<320>; +using LL_UInt512 = cpp::UInt<512>; +using LL_UInt1024 = cpp::UInt<1024>; + +using LL_Int128 = cpp::Int<128>; +using LL_Int192 = cpp::Int<192>; + +TEST(LlvmLibcUIntClassTest, BitCastToFromDouble) { + static_assert(cpp::is_trivially_copyable<LL_UInt64>::value); + static_assert(sizeof(LL_UInt64) == sizeof(double)); + const double inf = HUGE_VAL; + const double max = DBL_MAX; + const double array[] = {0.0, 0.1, 1.0, max, inf}; + for (double value : array) { + LL_UInt64 back = cpp::bit_cast<LL_UInt64>(value); + double forth = cpp::bit_cast<double>(back); + EXPECT_TRUE(value == forth); + } +} -using LL_Int128 = LIBC_NAMESPACE::cpp::Int<128>; -using LL_Int192 = LIBC_NAMESPACE::cpp::Int<192>; +#ifdef __SIZEOF_INT128__ +TEST(LlvmLibcUIntClassTest, BitCastToFromNativeUint128) { + static_assert(cpp::is_trivially_copyable<LL_UInt128>::value); + static_assert(sizeof(LL_UInt128) == sizeof(__uint128_t)); + const __uint128_t array[] = {0, 1, ~__uint128_t(0)}; + for (__uint128_t value : array) { + LL_UInt128 back = cpp::bit_cast<LL_UInt128>(value); + __uint128_t forth = cpp::bit_cast<__uint128_t>(back); + EXPECT_TRUE(value == forth); + } +} +#endif + +#ifdef LIBC_COMPILER_HAS_FLOAT128 +TEST(LlvmLibcUIntClassTest, BitCastToFromNativeFloat128) { + static_assert(cpp::is_trivially_copyable<LL_UInt128>::value); + static_assert(sizeof(LL_UInt128) == sizeof(float128)); + const float128 array[] = {0, 0.1, 1}; + for (float128 value : array) { + LL_UInt128 back = cpp::bit_cast<LL_UInt128>(value); + float128 forth = cpp::bit_cast<float128>(back); + EXPECT_TRUE(value == forth); + } +} +#endif TEST(LlvmLibcUIntClassTest, BasicInit) { LL_UInt128 half_val(12345); @@ -634,3 +677,5 @@ TEST(LlvmLibcUIntClassTest, ConstructorFromUInt128Tests) { } #endif // __SIZEOF_INT128__ + +} // namespace LIBC_NAMESPACE From 73948ec6b276ba6ab7c18eb543dd4ea5a37eeab8 Mon Sep 17 00:00:00 2001 From: Ben Shi <2283975856@qq.com> Date: Thu, 21 Dec 2023 17:57:06 +0800 Subject: [PATCH 041/342] [clang][analyzer] Support `fflush` in the StreamChecker (#74296) --- .../StaticAnalyzer/Checkers/StreamChecker.cpp | 86 +++++++++++++++++++ .../Analysis/Inputs/system-header-simulator.h | 1 + clang/test/Analysis/stream-error.c | 67 +++++++++++++++ 3 files changed, 154 insertions(+) diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index 925fc90e35543..254b36ed03968 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -266,6 +266,8 @@ class StreamChecker : public Checker<check::PreCall, eval::Call, {&StreamChecker::preFseek, &StreamChecker::evalFseek, 0}}, {{{"ftell"}, 1}, {&StreamChecker::preDefault, &StreamChecker::evalFtell, 0}}, + {{{"fflush"}, 1}, + {&StreamChecker::preFflush, &StreamChecker::evalFflush, 0}}, {{{"rewind"}, 1}, {&StreamChecker::preDefault, &StreamChecker::evalRewind, 0}}, {{{"fgetpos"}, 2}, @@ -360,6 +362,12 @@ class StreamChecker : public Checker<check::PreCall, eval::Call, CheckerContext &C, const StreamErrorState &ErrorKind) const; + void 
preFflush(const FnDescription *Desc, const CallEvent &Call, + CheckerContext &C) const; + + void evalFflush(const FnDescription *Desc, const CallEvent &Call, + CheckerContext &C) const; + /// Check that the stream (in StreamVal) is not NULL. /// If it can only be NULL a fatal error is emitted and nullptr returned. /// Otherwise the return value is a new state where the stream is constrained @@ -1191,6 +1199,84 @@ void StreamChecker::evalSetFeofFerror(const FnDescription *Desc, C.addTransition(State); } +void StreamChecker::preFflush(const FnDescription *Desc, const CallEvent &Call, + CheckerContext &C) const { + ProgramStateRef State = C.getState(); + SVal StreamVal = getStreamArg(Desc, Call); + std::optional<DefinedSVal> Stream = StreamVal.getAs<DefinedSVal>(); + if (!Stream) + return; + + ProgramStateRef StateNotNull, StateNull; + std::tie(StateNotNull, StateNull) = + C.getConstraintManager().assumeDual(State, *Stream); + if (StateNotNull && !StateNull) + ensureStreamOpened(StreamVal, C, StateNotNull); +} + +void StreamChecker::evalFflush(const FnDescription *Desc, const CallEvent &Call, + CheckerContext &C) const { + ProgramStateRef State = C.getState(); + SVal StreamVal = getStreamArg(Desc, Call); + std::optional<DefinedSVal> Stream = StreamVal.getAs<DefinedSVal>(); + if (!Stream) + return; + + // Skip if the stream can be both NULL and non-NULL. + ProgramStateRef StateNotNull, StateNull; + std::tie(StateNotNull, StateNull) = + C.getConstraintManager().assumeDual(State, *Stream); + if (StateNotNull && StateNull) + return; + if (StateNotNull && !StateNull) + State = StateNotNull; + else + State = StateNull; + + const CallExpr *CE = dyn_cast_or_null<CallExpr>(Call.getOriginExpr()); + if (!CE) + return; + + // `fflush` returns EOF on failure, otherwise returns 0. + ProgramStateRef StateFailed = bindInt(*EofVal, State, C, CE); + ProgramStateRef StateNotFailed = bindInt(0, State, C, CE); + + // Clear error states if `fflush` returns 0, but retain their EOF flags. + auto ClearErrorInNotFailed = [&StateNotFailed, Desc](SymbolRef Sym, + const StreamState *SS) { + if (SS->ErrorState & ErrorFError) { + StreamErrorState NewES = + (SS->ErrorState & ErrorFEof) ? ErrorFEof : ErrorNone; + StreamState NewSS = StreamState::getOpened(Desc, NewES, false); + StateNotFailed = StateNotFailed->set<StreamMap>(Sym, NewSS); + } + }; + + if (StateNotNull && !StateNull) { + // Skip if the input stream's state is unknown, open-failed or closed. + if (SymbolRef StreamSym = StreamVal.getAsSymbol()) { + const StreamState *SS = State->get<StreamMap>(StreamSym); + if (SS) { + assert(SS->isOpened() && "Stream is expected to be opened"); + ClearErrorInNotFailed(StreamSym, SS); + } else + return; + } + } else { + // Clear error states for all streams. 
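+    // A null stream argument corresponds to fflush(NULL), which acts on all
+    // open streams, so every stream tracked in the map below is updated.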
+ const StreamMapTy &Map = StateNotFailed->get<StreamMap>(); + for (const auto &I : Map) { + SymbolRef Sym = I.first; + const StreamState &SS = I.second; + if (SS.isOpened()) + ClearErrorInNotFailed(Sym, &SS); + } + } + + C.addTransition(StateNotFailed); + C.addTransition(StateFailed); +} + ProgramStateRef StreamChecker::ensureStreamNonNull(SVal StreamVal, const Expr *StreamE, CheckerContext &C, diff --git a/clang/test/Analysis/Inputs/system-header-simulator.h b/clang/test/Analysis/Inputs/system-header-simulator.h index 7089bd8bfc9d9..409a969a0d4cc 100644 --- a/clang/test/Analysis/Inputs/system-header-simulator.h +++ b/clang/test/Analysis/Inputs/system-header-simulator.h @@ -61,6 +61,7 @@ void clearerr(FILE *stream); int feof(FILE *stream); int ferror(FILE *stream); int fileno(FILE *stream); +int fflush(FILE *stream); size_t strlen(const char *); diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c index c8332bcbfa8ca..37e1e54dfc89d 100644 --- a/clang/test/Analysis/stream-error.c +++ b/clang/test/Analysis/stream-error.c @@ -299,6 +299,73 @@ void error_fseek_0(void) { fclose(F); } +void error_fflush_after_fclose(void) { + FILE *F = tmpfile(); + int Ret; + fflush(NULL); // no-warning + if (!F) + return; + if ((Ret = fflush(F)) != 0) + clang_analyzer_eval(Ret == EOF); // expected-warning {{TRUE}} + fclose(F); + fflush(F); // expected-warning {{Stream might be already closed}} +} + +void error_fflush_on_open_failed_stream(void) { + FILE *F = tmpfile(); + if (!F) { + fflush(F); // no-warning + return; + } + fclose(F); +} + +void error_fflush_on_unknown_stream(FILE *F) { + fflush(F); // no-warning + fclose(F); // no-warning +} + +void error_fflush_on_non_null_stream_clear_error_states(void) { + FILE *F0 = tmpfile(), *F1 = tmpfile(); + // `fflush` clears a non-EOF stream's error state. + if (F0) { + StreamTesterChecker_make_ferror_stream(F0); + if (fflush(F0) == 0) { // no-warning + clang_analyzer_eval(ferror(F0)); // expected-warning {{FALSE}} + clang_analyzer_eval(feof(F0)); // expected-warning {{FALSE}} + } + fclose(F0); + } + // `fflush` clears an EOF stream's error state. + if (F1) { + StreamTesterChecker_make_feof_stream(F1); + if (fflush(F1) == 0) { // no-warning + clang_analyzer_eval(ferror(F1)); // expected-warning {{FALSE}} + clang_analyzer_eval(feof(F1)); // expected-warning {{TRUE}} + } + fclose(F1); + } +} + +void error_fflush_on_null_stream_clear_error_states(void) { + FILE *F0 = tmpfile(), *F1 = tmpfile(); + // `fflush` clears all stream's error states, while retains their EOF states. + if (F0 && F1) { + StreamTesterChecker_make_ferror_stream(F0); + StreamTesterChecker_make_feof_stream(F1); + if (fflush(NULL) == 0) { // no-warning + clang_analyzer_eval(ferror(F0)); // expected-warning {{FALSE}} + clang_analyzer_eval(feof(F0)); // expected-warning {{FALSE}} + clang_analyzer_eval(ferror(F1)); // expected-warning {{FALSE}} + clang_analyzer_eval(feof(F1)); // expected-warning {{TRUE}} + } + } + if (F0) + fclose(F0); + if (F1) + fclose(F1); +} + void error_indeterminate(void) { FILE *F = fopen("file", "r+"); if (!F) From cb3a8934365c11ab23c918b44985f5a2f287acb1 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough <ethanluismcdonough@gmail.com> Date: Thu, 21 Dec 2023 04:00:35 -0600 Subject: [PATCH 042/342] [OpenMP] Check for gtest when building libomptarget unit tests (#76141) This patch addresses an issue introduced in pull request #74398. CMake will attempt to re-build gtest if openmp is enabled as a project (as opposed to being enabled as a runtime). 
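In a project build the LLVM tree has usually created the `llvm_gtest` target
already, so the bundled unittest directory should only be added when that
target is missing. A minimal sketch of the intended guard (the `llvm_gtest`
target and directory names are taken from the change below):
```
if (NOT TARGET llvm_gtest)
  add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest
                   ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest)
endif()
```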
This patch adds a check that prevents this from happening. --- openmp/libomptarget/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt index 7060e20af0932..31e475d86960a 100644 --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -153,6 +153,8 @@ add_subdirectory(test) # Add unit tests if GMock/GTest is present if (EXISTS ${LLVM_THIRD_PARTY_DIR}/unittest) - add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest) + if (NOT TARGET llvm_gtest) + add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest) + endif() add_subdirectory(unittests) endif() From 78bd124649ece163d3a26b33608bdbe518d8ff76 Mon Sep 17 00:00:00 2001 From: Alex Zinenko <zinenko@google.com> Date: Thu, 21 Dec 2023 10:01:44 +0000 Subject: [PATCH 043/342] Revert "[mlir][python] Make the Context/Operation capsule creation methods work as documented. (#76010)" This reverts commit bbc29768683b394b34600347f46be2b8245ddb30. This change seems to be at odds with the non-owning part semantics of MlirOperation in C API. Since downstream clients can only take and return MlirOperation, it does not sound correct to force all returns of MlirOperation transfer ownership. Specifically, this makes it impossible for downstreams to implement IR-traversing functions that, e.g., look at neighbors of an operation. The following patch triggers the exception, and there does not seem to be an alternative way for a downstream binding writer to express this: ``` diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 39757dfad5be..2ce640674245 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -3071,6 +3071,11 @@ void mlir::python::populateIRCore(py::module &m) { py::arg("successors") = py::none(), py::arg("regions") = 0, py::arg("loc") = py::none(), py::arg("ip") = py::none(), py::arg("infer_type") = false, kOperationCreateDocstring) + .def("_get_first_in_block", [](PyOperation &self) -> MlirOperation { + MlirBlock block = mlirOperationGetBlock(self.get()); + MlirOperation first = mlirBlockGetFirstOperation(block); + return first; + }) .def_static( "parse", [](const std::string &sourceStr, const std::string &sourceName, diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py index f59b1a26ba48..6b12b8da5c24 100644 --- a/mlir/test/python/ir/operation.py +++ b/mlir/test/python/ir/operation.py @@ -24,6 +24,25 @@ def expect_index_error(callback): except IndexError: pass +@run +def testCustomBind(): + ctx = Context() + ctx.allow_unregistered_dialects = True + module = Module.parse( + r""" + func.func @f1(%arg0: i32) -> i32 { + %1 = "custom.addi"(%arg0, %arg0) : (i32, i32) -> i32 + return %1 : i32 + } + """, + ctx, + ) + add = module.body.operations[0].regions[0].blocks[0].operations[0] + op = add.operation + # This will get a reference to itself. + f1 = op._get_first_in_block() + + # Verify iterator based traversal of the op/region/block hierarchy. 
# CHECK-LABEL: TEST: testTraverseOpRegionBlockIterators ``` --- mlir/lib/Bindings/Python/IRCore.cpp | 78 +++--------------------- mlir/lib/Bindings/Python/IRModule.h | 19 +----- mlir/test/python/ir/context_lifecycle.py | 45 +------------- mlir/test/python/ir/operation.py | 13 ++++ 4 files changed, 26 insertions(+), 129 deletions(-) diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 39757dfad5be1..5412c3dec4b1b 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -602,7 +602,7 @@ py::object PyMlirContext::createFromCapsule(py::object capsule) { MlirContext rawContext = mlirPythonCapsuleToContext(capsule.ptr()); if (mlirContextIsNull(rawContext)) throw py::error_already_set(); - return stealExternalContext(rawContext).releaseObject(); + return forContext(rawContext).releaseObject(); } PyMlirContext *PyMlirContext::createNewContextForInit() { @@ -615,35 +615,18 @@ PyMlirContextRef PyMlirContext::forContext(MlirContext context) { auto &liveContexts = getLiveContexts(); auto it = liveContexts.find(context.ptr); if (it == liveContexts.end()) { - throw std::runtime_error( - "Cannot use a context that is not owned by the Python bindings."); + // Create. + PyMlirContext *unownedContextWrapper = new PyMlirContext(context); + py::object pyRef = py::cast(unownedContextWrapper); + assert(pyRef && "cast to py::object failed"); + liveContexts[context.ptr] = unownedContextWrapper; + return PyMlirContextRef(unownedContextWrapper, std::move(pyRef)); } - // Use existing. py::object pyRef = py::cast(it->second); return PyMlirContextRef(it->second, std::move(pyRef)); } -PyMlirContextRef PyMlirContext::stealExternalContext(MlirContext context) { - py::gil_scoped_acquire acquire; - auto &liveContexts = getLiveContexts(); - auto it = liveContexts.find(context.ptr); - if (it != liveContexts.end()) { - throw std::runtime_error( - "Cannot transfer ownership of the context to Python " - "as it is already owned by Python."); - } - - PyMlirContext *unownedContextWrapper = new PyMlirContext(context); - // Note that the default return value policy on cast is automatic_reference, - // which does not take ownership (delete will not be called). - // Just be explicit. 
- py::object pyRef = - py::cast(unownedContextWrapper, py::return_value_policy::take_ownership); - assert(pyRef && "cast to py::object failed"); - return PyMlirContextRef(unownedContextWrapper, std::move(pyRef)); -} - PyMlirContext::LiveContextMap &PyMlirContext::getLiveContexts() { static LiveContextMap liveContexts; return liveContexts; @@ -1162,18 +1145,6 @@ PyOperationRef PyOperation::forOperation(PyMlirContextRef contextRef, return PyOperationRef(existing, std::move(pyRef)); } -PyOperationRef PyOperation::stealExternalOperation(PyMlirContextRef contextRef, - MlirOperation operation) { - auto &liveOperations = contextRef->liveOperations; - auto it = liveOperations.find(operation.ptr); - if (it != liveOperations.end()) { - throw std::runtime_error( - "Cannot transfer ownership of the operation to Python " - "as it is already owned by Python."); - } - return createInstance(std::move(contextRef), operation, py::none()); -} - PyOperationRef PyOperation::createDetached(PyMlirContextRef contextRef, MlirOperation operation, py::object parentKeepAlive) { @@ -1345,8 +1316,7 @@ py::object PyOperation::createFromCapsule(py::object capsule) { if (mlirOperationIsNull(rawOperation)) throw py::error_already_set(); MlirContext rawCtxt = mlirOperationGetContext(rawOperation); - return stealExternalOperation(PyMlirContext::forContext(rawCtxt), - rawOperation) + return forOperation(PyMlirContext::forContext(rawCtxt), rawOperation) .releaseObject(); } @@ -2578,16 +2548,6 @@ void mlir::python::populateIRCore(py::module &m) { .def("_get_live_operation_count", &PyMlirContext::getLiveOperationCount) .def("_clear_live_operations", &PyMlirContext::clearLiveOperations) .def("_get_live_module_count", &PyMlirContext::getLiveModuleCount) - .def_static("_testing_create_raw_context_capsule", - []() { - // Creates an MlirContext not known to the Python bindings - // and puts it in a capsule. Used to test interop. Using - // this without passing it back to the capsule creation - // API will leak. - return py::reinterpret_steal<py::object>( - mlirPythonContextToCapsule( - mlirContextCreateWithThreading(false))); - }) .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, &PyMlirContext::getCapsule) .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyMlirContext::createFromCapsule) @@ -3013,7 +2973,8 @@ void mlir::python::populateIRCore(py::module &m) { py::arg("binary") = false, kOperationPrintStateDocstring) .def("print", py::overload_cast<std::optional<int64_t>, bool, bool, bool, bool, - bool, py::object, bool>(&PyOperationBase::print), + bool, py::object, bool>( + &PyOperationBase::print), // Careful: Lots of arguments must match up with print method. py::arg("large_elements_limit") = py::none(), py::arg("enable_debug_info") = false, @@ -3085,25 +3046,6 @@ void mlir::python::populateIRCore(py::module &m) { .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, &PyOperation::getCapsule) .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyOperation::createFromCapsule) - .def_static( - "_testing_create_raw_capsule", - [](std::string sourceStr) { - // Creates a raw context and an operation via parsing the given - // source and returns them in a capsule. Error handling is - // minimal as this is purely intended for testing interop with - // operation creation from capsule functions. 
- MlirContext context = mlirContextCreateWithThreading(false); - MlirOperation op = mlirOperationCreateParse( - context, toMlirStringRef(sourceStr), toMlirStringRef("temp")); - if (mlirOperationIsNull(op)) { - mlirContextDestroy(context); - throw std::invalid_argument("Failed to parse"); - } - return py::make_tuple(py::reinterpret_steal<py::object>( - mlirPythonContextToCapsule(context)), - py::reinterpret_steal<py::object>( - mlirPythonOperationToCapsule(op))); - }) .def_property_readonly("operation", [](py::object self) { return self; }) .def_property_readonly("opview", &PyOperation::createOpView) .def_property_readonly( diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 04164b78b3e25..79b7e0c96188c 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -176,19 +176,8 @@ class PyMlirContext { static PyMlirContext *createNewContextForInit(); /// Returns a context reference for the singleton PyMlirContext wrapper for - /// the given context. It is only valid to call this on an MlirContext that - /// is already owned by the Python bindings. Typically this will be because - /// it came in some fashion from createNewContextForInit(). However, it - /// is also possible to explicitly transfer ownership of an existing - /// MlirContext to the Python bindings via stealExternalContext(). + /// the given context. static PyMlirContextRef forContext(MlirContext context); - - /// Explicitly takes ownership of an MlirContext that must not already be - /// known to the Python bindings. Once done, the life-cycle of the context - /// will be controlled by the Python bindings, and it will be destroyed - /// when the reference count goes to zero. - static PyMlirContextRef stealExternalContext(MlirContext context); - ~PyMlirContext(); /// Accesses the underlying MlirContext. @@ -617,12 +606,6 @@ class PyOperation : public PyOperationBase, public BaseContextObject { forOperation(PyMlirContextRef contextRef, MlirOperation operation, pybind11::object parentKeepAlive = pybind11::object()); - /// Explicitly takes ownership of an operation that must not already be known - /// to the Python bindings. Once done, the life-cycle of the operation - /// will be controlled by the Python bindings. - static PyOperationRef stealExternalOperation(PyMlirContextRef contextRef, - MlirOperation operation); - /// Creates a detached operation. The operation must not be associated with /// any existing live operation. static PyOperationRef diff --git a/mlir/test/python/ir/context_lifecycle.py b/mlir/test/python/ir/context_lifecycle.py index fbd1851ba70ae..c20270999425e 100644 --- a/mlir/test/python/ir/context_lifecycle.py +++ b/mlir/test/python/ir/context_lifecycle.py @@ -45,46 +45,5 @@ c4 = mlir.ir.Context() c4_capsule = c4._CAPIPtr assert '"mlir.ir.Context._CAPIPtr"' in repr(c4_capsule) -# Because the context is already owned by Python, it cannot be created -# a second time. -try: - c5 = mlir.ir.Context._CAPICreate(c4_capsule) -except RuntimeError: - pass -else: - raise AssertionError( - "Should have gotten a RuntimeError when attempting to " - "re-create an already owned context" - ) -c4 = None -c4_capsule = None -gc.collect() -assert mlir.ir.Context._get_live_count() == 0 - -# Use a private testing method to create an unowned context capsule and -# import it. 
-c6_capsule = mlir.ir.Context._testing_create_raw_context_capsule() -c6 = mlir.ir.Context._CAPICreate(c6_capsule) -assert mlir.ir.Context._get_live_count() == 1 -c6_capsule = None -c6 = None -gc.collect() -assert mlir.ir.Context._get_live_count() == 0 - -# Also test operation import/export as it is tightly coupled to the context. -( - raw_context_capsule, - raw_operation_capsule, -) = mlir.ir.Operation._testing_create_raw_capsule("builtin.module {}") -assert '"mlir.ir.Operation._CAPIPtr"' in repr(raw_operation_capsule) -# Attempting to import an operation for an unknown context should fail. -try: - mlir.ir.Operation._CAPICreate(raw_operation_capsule) -except RuntimeError: - pass -else: - raise AssertionError("Expected exception for unknown context") - -# Try again having imported the context. -c7 = mlir.ir.Context._CAPICreate(raw_context_capsule) -op7 = mlir.ir.Operation._CAPICreate(raw_operation_capsule) +c5 = mlir.ir.Context._CAPICreate(c4_capsule) +assert c4 is c5 diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py index f59b1a26ba48b..04f8a9936e31f 100644 --- a/mlir/test/python/ir/operation.py +++ b/mlir/test/python/ir/operation.py @@ -844,6 +844,19 @@ def testOperationName(): print(op.operation.name) +# CHECK-LABEL: TEST: testCapsuleConversions +@run +def testCapsuleConversions(): + ctx = Context() + ctx.allow_unregistered_dialects = True + with Location.unknown(ctx): + m = Operation.create("custom.op1").operation + m_capsule = m._CAPIPtr + assert '"mlir.ir.Operation._CAPIPtr"' in repr(m_capsule) + m2 = Operation._CAPICreate(m_capsule) + assert m2 is m + + # CHECK-LABEL: TEST: testOperationErase @run def testOperationErase(): From 5b9be0ec8d42cb390048f5c3ac8782c377ef1aa6 Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.foad@amd.com> Date: Thu, 21 Dec 2023 10:13:17 +0000 Subject: [PATCH 044/342] [AMDGPU] Test parsing elements of CPol operand in any order (#76139) Co-authored-by: Mirko Brkusanin <Mirko.Brkusanin@amd.com> --- llvm/test/MC/AMDGPU/gfx12_asm_features.s | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_features.s diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_features.s b/llvm/test/MC/AMDGPU/gfx12_asm_features.s new file mode 100644 index 0000000000000..7e58bdb3b444e --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_features.s @@ -0,0 +1,29 @@ +// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s + +// +// Elements of CPol operand can be given in any order +// + +image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_LOAD_HT scope:SCOPE_SE +// GFX12: encoding: [0x00,0x00,0x40,0xd0,0x00,0x00,0x24,0x00,0x00,0x00,0x00,0x00] + +image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D scope:SCOPE_SE th:TH_LOAD_HT +// GFX12: encoding: [0x00,0x00,0x40,0xd0,0x00,0x00,0x24,0x00,0x00,0x00,0x00,0x00] + +image_sample v[29:30], [v31, v32, v33], s[32:39], s[68:71] dmask:0x3 dim:SQ_RSRC_IMG_3D th:TH_LOAD_NT scope:SCOPE_SYS +// GFX12: encoding: [0x02,0xc0,0xc6,0xe4,0x1d,0x40,0x1c,0x22,0x1f,0x20,0x21,0x00] + +image_sample v[29:30], [v31, v32, v33], s[32:39], s[68:71] dmask:0x3 dim:SQ_RSRC_IMG_3D scope:SCOPE_SYS th:TH_LOAD_NT +// GFX12: encoding: [0x02,0xc0,0xc6,0xe4,0x1d,0x40,0x1c,0x22,0x1f,0x20,0x21,0x00] + +buffer_load_b32 v5, off, s[8:11], s3 offset:8388607 th:TH_LOAD_NT_HT scope:SCOPE_DEV +// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f] + +buffer_load_b32 v5, off, s[8:11], s3 offset:8388607 scope:SCOPE_DEV th:TH_LOAD_NT_HT 
+// GFX12: encoding: [0x03,0x00,0x05,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f] + +tbuffer_load_d16_format_x v4, off, ttmp[4:7], s3 format:[BUF_FMT_8_UINT] offset:8388607 th:TH_LOAD_BYPASS scope:SCOPE_SYS +// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0xe0,0xbc,0x02,0x00,0xff,0xff,0x7f] + +tbuffer_load_d16_format_x v4, off, ttmp[4:7], s3 format:[BUF_FMT_8_UINT] offset:8388607 scope:SCOPE_SYS th:TH_LOAD_BYPASS +// GFX12: encoding: [0x03,0x00,0x22,0xc4,0x04,0xe0,0xbc,0x02,0x00,0xff,0xff,0x7f] From 18af032c0e16252effeb6dfd02113812388f1d31 Mon Sep 17 00:00:00 2001 From: Yi Wu <43659785+yi-wu-arm@users.noreply.github.com> Date: Thu, 21 Dec 2023 10:35:28 +0000 Subject: [PATCH 045/342] [flang] add GETLOG runtime and extension implementation: get login username (#74628) Get login username, ussage: ``` CHARACTER(32) :: login CALL getlog(login) WRITE(*,*) login ``` getlog is required for an exascale proxyapp. https://proxyapps.exascaleproject.org/app/minismac2d/ https://github.com/Mantevo/miniSMAC/blob/f90446714226eeef650b78bce06ca4967792e74d/ref/smac2d.f#L615 https://github.com/Mantevo/miniSMAC/blob/f90446714226eeef650b78bce06ca4967792e74d/ref/smac2d.f#L1570 --------- Co-authored-by: Yi Wu <43659785+PAX-12-WU@users.noreply.github.com> Co-authored-by: Yi Wu <yiwu02@wdev-yiwu02.arm.com> Co-authored-by: Kiran Chandramohan <kiranchandramohan@gmail.com> --- flang/docs/Intrinsics.md | 6 ++ flang/include/flang/Runtime/extensions.h | 4 ++ flang/runtime/character.cpp | 22 +------ flang/runtime/extensions.cpp | 39 +++++++++++ flang/runtime/tools.h | 22 +++++++ flang/unittests/Runtime/CommandTest.cpp | 83 ++++++++++++++++++++++++ 6 files changed, 155 insertions(+), 21 deletions(-) diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index fef2b4ea4dd8c..189920a0881b2 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -695,6 +695,11 @@ CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, LOC MALLOC ``` +### Library subroutine +``` +CALL GETLOG(USRNAME) +``` + ## Intrinsic Procedure Name Resolution When the name of a procedure in a program is the same as the one of an intrinsic @@ -754,6 +759,7 @@ This phase currently supports all the intrinsic procedures listed above but the | Intrinsic subroutines |MVBITS (elemental), CPU_TIME, DATE_AND_TIME, EVENT_QUERY, EXECUTE_COMMAND_LINE, GET_COMMAND, GET_COMMAND_ARGUMENT, GET_ENVIRONMENT_VARIABLE, MOVE_ALLOC, RANDOM_INIT, RANDOM_NUMBER, RANDOM_SEED, SYSTEM_CLOCK | | Atomic intrinsic subroutines | ATOMIC_ADD | | Collective intrinsic subroutines | CO_REDUCE | +| Library subroutines | GETLOG| ### Intrinsic Function Folding diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h index ad592814e5acb..175113c57ccb5 100644 --- a/flang/include/flang/Runtime/extensions.h +++ b/flang/include/flang/Runtime/extensions.h @@ -14,6 +14,7 @@ #define FORTRAN_PROCEDURE_NAME(name) name##_ +#include <cstddef> #include <cstdint> extern "C" { @@ -28,5 +29,8 @@ std::int32_t FORTRAN_PROCEDURE_NAME(iargc)(); void FORTRAN_PROCEDURE_NAME(getarg)( std::int32_t &n, std::int8_t *arg, std::int64_t length); +// GNU extension subroutine GETLOG(C). 
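+// Usage from Fortran: CHARACTER(32) :: login ; CALL getlog(login)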
+void FORTRAN_PROCEDURE_NAME(getlog)(std::byte *name, std::int64_t length); + } // extern "C" #endif // FORTRAN_RUNTIME_EXTENSIONS_H_ diff --git a/flang/runtime/character.cpp b/flang/runtime/character.cpp index 2afde7cd5e833..084aa0c9c8b64 100644 --- a/flang/runtime/character.cpp +++ b/flang/runtime/character.cpp @@ -11,6 +11,7 @@ #include "tools.h" #include "flang/Common/bit-population-count.h" #include "flang/Common/uint128.h" +#include "flang/Runtime/character.h" #include "flang/Runtime/cpp-type.h" #include "flang/Runtime/descriptor.h" #include <algorithm> @@ -464,27 +465,6 @@ static void GeneralCharFuncKind(Descriptor &result, const Descriptor &string, } } -template <typename TO, typename FROM> -static void CopyAndPad( - TO *to, const FROM *from, std::size_t toChars, std::size_t fromChars) { - if constexpr (sizeof(TO) != sizeof(FROM)) { - std::size_t copyChars{std::min(toChars, fromChars)}; - for (std::size_t j{0}; j < copyChars; ++j) { - to[j] = from[j]; - } - for (std::size_t j{copyChars}; j < toChars; ++j) { - to[j] = static_cast<TO>(' '); - } - } else if (toChars <= fromChars) { - std::memcpy(to, from, toChars * sizeof(TO)); - } else { - std::memcpy(to, from, fromChars * sizeof(TO)); - for (std::size_t j{fromChars}; j < toChars; ++j) { - to[j] = static_cast<TO>(' '); - } - } -} - template <typename CHAR, bool ISMIN> static void MaxMinHelper(Descriptor &accumulator, const Descriptor &x, const Terminator &terminator) { diff --git a/flang/runtime/extensions.cpp b/flang/runtime/extensions.cpp index b8e9b6eae1320..1c025d40b3952 100644 --- a/flang/runtime/extensions.cpp +++ b/flang/runtime/extensions.cpp @@ -10,13 +10,29 @@ // extensions that will eventually be implemented in Fortran. #include "flang/Runtime/extensions.h" +#include "tools.h" #include "flang/Runtime/command.h" #include "flang/Runtime/descriptor.h" #include "flang/Runtime/io-api.h" +#if _REENTRANT || _POSIX_C_SOURCE >= 199506L +// System is posix-compliant and has getlogin_r +#include <unistd.h> +#endif + extern "C" { namespace Fortran::runtime { + +void GetUsernameEnvVar( + const char *envName, std::byte *arg, std::int64_t length) { + Descriptor name{*Descriptor::Create( + 1, std::strlen(envName) + 1, const_cast<char *>(envName), 0)}; + Descriptor value{*Descriptor::Create(1, length, arg, 0)}; + + RTNAME(GetEnvVariable) + (name, &value, nullptr, false, nullptr, __FILE__, __LINE__); +} namespace io { // SUBROUTINE FLUSH(N) // FLUSH N @@ -37,5 +53,28 @@ void FORTRAN_PROCEDURE_NAME(getarg)( (void)RTNAME(GetCommandArgument)( n, &value, nullptr, nullptr, __FILE__, __LINE__); } + +// CALL GETLOG(USRNAME) +void FORTRAN_PROCEDURE_NAME(getlog)(std::byte *arg, std::int64_t length) { +#if _REENTRANT || _POSIX_C_SOURCE >= 199506L + const int nameMaxLen{LOGIN_NAME_MAX + 1}; + char str[nameMaxLen]; + + int error{getlogin_r(str, nameMaxLen)}; + if (error == 0) { + // no error: find first \0 in string then pad from there + CopyAndPad(reinterpret_cast<char *>(arg), str, length, std::strlen(str)); + } else { + // error occur: get username from environment variable + GetUsernameEnvVar("LOGNAME", arg, length); + } +#elif _WIN32 + // Get username from environment to avoid link to Advapi32.lib + GetUsernameEnvVar("USERNAME", arg, length); +#else + GetUsernameEnvVar("LOGNAME", arg, length); +#endif +} + } // namespace Fortran::runtime } // extern "C" diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h index ea659190e1439..9811bce25acd3 100644 --- a/flang/runtime/tools.h +++ b/flang/runtime/tools.h @@ -411,5 +411,27 @@ RT_API_ATTRS void 
ShallowCopy(const Descriptor &to, const Descriptor &from, bool toIsContiguous, bool fromIsContiguous); RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from); +// Defines a utility function for copying and padding characters +template <typename TO, typename FROM> +RT_API_ATTRS void CopyAndPad( + TO *to, const FROM *from, std::size_t toChars, std::size_t fromChars) { + if constexpr (sizeof(TO) != sizeof(FROM)) { + std::size_t copyChars{std::min(toChars, fromChars)}; + for (std::size_t j{0}; j < copyChars; ++j) { + to[j] = from[j]; + } + for (std::size_t j{copyChars}; j < toChars; ++j) { + to[j] = static_cast<TO>(' '); + } + } else if (toChars <= fromChars) { + std::memcpy(to, from, toChars * sizeof(TO)); + } else { + std::memcpy(to, from, std::min(toChars, fromChars) * sizeof(TO)); + for (std::size_t j{fromChars}; j < toChars; ++j) { + to[j] = static_cast<TO>(' '); + } + } +} + } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_TOOLS_H_ diff --git a/flang/unittests/Runtime/CommandTest.cpp b/flang/unittests/Runtime/CommandTest.cpp index 2b648b31666ae..dfc3ad68b3ab9 100644 --- a/flang/unittests/Runtime/CommandTest.cpp +++ b/flang/unittests/Runtime/CommandTest.cpp @@ -10,9 +10,15 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" #include "flang/Runtime/descriptor.h" +#include "flang/Runtime/extensions.h" #include "flang/Runtime/main.h" +#include <cstddef> #include <cstdlib> +#if _REENTRANT || _POSIX_C_SOURCE >= 199506L +#include <limits.h> // LOGIN_NAME_MAX used in getlog test +#endif + using namespace Fortran::runtime; template <std::size_t n = 64> @@ -59,6 +65,13 @@ class CommandFixture : public ::testing::Test { return res; } + void CheckCharEqStr(const char *value, const std::string &expected) const { + ASSERT_NE(value, nullptr); + EXPECT_EQ(std::strncmp(value, expected.c_str(), expected.size()), 0) + << "expected: " << expected << "\n" + << "value: " << value; + } + void CheckDescriptorEqStr( const Descriptor *value, const std::string &expected) const { ASSERT_NE(value, nullptr); @@ -397,6 +410,11 @@ class EnvironmentVariables : public CommandFixture { protected: EnvironmentVariables() : CommandFixture(0, nullptr) { SetEnv("NAME", "VALUE"); +#ifdef _WIN32 + SetEnv("USERNAME", "loginName"); +#else + SetEnv("LOGNAME", "loginName"); +#endif SetEnv("EMPTY", ""); } @@ -494,3 +512,68 @@ TEST_F(EnvironmentVariables, ErrMsgTooShort) { 1); CheckDescriptorEqStr(errMsg.get(), "Mis"); } + +// username first char must not be null +TEST_F(EnvironmentVariables, GetlogGetName) { + const int charLen{3}; + char input[charLen]{"\0\0"}; + + FORTRAN_PROCEDURE_NAME(getlog) + (reinterpret_cast<std::byte *>(input), charLen); + + EXPECT_NE(input[0], '\0'); +} + +#if _REENTRANT || _POSIX_C_SOURCE >= 199506L +TEST_F(EnvironmentVariables, GetlogPadSpace) { + // guarantee 1 char longer than max, last char should be pad space + const int charLen{LOGIN_NAME_MAX + 2}; + char input[charLen]; + + FORTRAN_PROCEDURE_NAME(getlog) + (reinterpret_cast<std::byte *>(input), charLen); + + EXPECT_EQ(input[charLen - 1], ' '); +} +#endif + +#ifdef _WIN32 // Test ability to get name from environment variable +TEST_F(EnvironmentVariables, GetlogEnvGetName) { + if (EnableFineGrainedTests()) { + ASSERT_NE(std::getenv("USERNAME"), nullptr) + << "Environment variable USERNAME does not exist"; + + char input[]{"XXXXXXXXX"}; + FORTRAN_PROCEDURE_NAME(getlog) + (reinterpret_cast<std::byte *>(input), sizeof(input)); + + CheckCharEqStr(input, "loginName"); + } +} + +TEST_F(EnvironmentVariables, 
GetlogEnvBufferShort) { + if (EnableFineGrainedTests()) { + ASSERT_NE(std::getenv("USERNAME"), nullptr) + << "Environment variable USERNAME does not exist"; + + char input[]{"XXXXXX"}; + FORTRAN_PROCEDURE_NAME(getlog) + (reinterpret_cast<std::byte *>(input), sizeof(input)); + + CheckCharEqStr(input, "loginN"); + } +} + +TEST_F(EnvironmentVariables, GetlogEnvPadSpace) { + if (EnableFineGrainedTests()) { + ASSERT_NE(std::getenv("USERNAME"), nullptr) + << "Environment variable USERNAME does not exist"; + + char input[]{"XXXXXXXXXX"}; + FORTRAN_PROCEDURE_NAME(getlog) + (reinterpret_cast<std::byte *>(input), sizeof(input)); + + CheckCharEqStr(input, "loginName "); + } +} +#endif From 4bad0cb359d3066fb29f589e408a5b812a628896 Mon Sep 17 00:00:00 2001 From: zhongyunde 00443407 <zhongyunde@huawei.com> Date: Wed, 20 Dec 2023 20:55:35 -0500 Subject: [PATCH 046/342] [AArch64] Precommit tests for PR75343, NFC --- llvm/test/CodeGen/AArch64/arm64-addrmode.ll | 268 +++++++++++++++++- .../AArch64/large-offset-ldr-merge.mir | 48 ++++ 2 files changed, 314 insertions(+), 2 deletions(-) create mode 100755 llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll index 3d4749a7b8e7d..d39029163a47a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll @@ -210,12 +210,26 @@ define void @t17(i64 %a) { ret void } -define i32 @LdOffset_i8(ptr %a) { +; LDRBBroX +define i8 @LdOffset_i8(ptr %a) { ; CHECK-LABEL: LdOffset_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #56952 // =0xde78 ; CHECK-NEXT: movk w8, #15, lsl #16 ; CHECK-NEXT: ldrb w0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 + %val = load i8, ptr %arrayidx, align 1 + ret i8 %val +} + +; LDRBBroX +define i32 @LdOffset_i8_zext32(ptr %a) { +; CHECK-LABEL: LdOffset_i8_zext32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -223,11 +237,81 @@ define i32 @LdOffset_i8(ptr %a) { ret i32 %conv } -define i32 @LdOffset_i16(ptr %a) { +; LDRSBWroX +define i32 @LdOffset_i8_sext32(ptr %a) { +; CHECK-LABEL: LdOffset_i8_sext32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrsb w0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 + %val = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %val to i32 + ret i32 %conv +} + +; LDRBBroX +define i64 @LdOffset_i8_zext64(ptr %a) { +; CHECK-LABEL: LdOffset_i8_zext64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 + %val = load i8, ptr %arrayidx, align 1 + %conv = zext i8 %val to i64 + ret i64 %conv +} + +; LDRSBXroX +define i64 @LdOffset_i8_sext64(ptr %a) { +; CHECK-LABEL: LdOffset_i8_sext64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrsb x0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 + %val = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %val to i64 + ret i64 %conv +} + +; LDRHHroX +define i16 @LdOffset_i16(ptr %a) { ; CHECK-LABEL: LdOffset_i16: ; CHECK: // 
%bb.0: ; CHECK-NEXT: mov w8, #48368 // =0xbcf0 ; CHECK-NEXT: movk w8, #31, lsl #16 +; CHECK-NEXT: ldrh w0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 + %val = load i16, ptr %arrayidx, align 2 + ret i16 %val +} + +; LDRHHroX +define i32 @LdOffset_i16_zext32(ptr %a) { +; CHECK-LABEL: LdOffset_i16_zext32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #48368 // =0xbcf0 +; CHECK-NEXT: movk w8, #31, lsl #16 +; CHECK-NEXT: ldrh w0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 + %val = load i16, ptr %arrayidx, align 2 + %conv = zext i16 %val to i32 + ret i32 %conv +} + +; LDRSHWroX +define i32 @LdOffset_i16_sext32(ptr %a) { +; CHECK-LABEL: LdOffset_i16_sext32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #48368 // =0xbcf0 +; CHECK-NEXT: movk w8, #31, lsl #16 ; CHECK-NEXT: ldrsh w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 @@ -236,6 +320,35 @@ define i32 @LdOffset_i16(ptr %a) { ret i32 %conv } +; LDRHHroX +define i64 @LdOffset_i16_zext64(ptr %a) { +; CHECK-LABEL: LdOffset_i16_zext64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #48368 // =0xbcf0 +; CHECK-NEXT: movk w8, #31, lsl #16 +; CHECK-NEXT: ldrh w0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 + %val = load i16, ptr %arrayidx, align 2 + %conv = zext i16 %val to i64 + ret i64 %conv +} + +; LDRSHXroX +define i64 @LdOffset_i16_sext64(ptr %a) { +; CHECK-LABEL: LdOffset_i16_sext64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #48368 // =0xbcf0 +; CHECK-NEXT: movk w8, #31, lsl #16 +; CHECK-NEXT: ldrsh x0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 + %val = load i16, ptr %arrayidx, align 2 + %conv = sext i16 %val to i64 + ret i64 %conv +} + +; LDRWroX define i32 @LdOffset_i32(ptr %a) { ; CHECK-LABEL: LdOffset_i32: ; CHECK: // %bb.0: @@ -248,6 +361,133 @@ define i32 @LdOffset_i32(ptr %a) { ret i32 %val } +; LDRWroX +define i64 @LdOffset_i32_zext64(ptr %a) { +; CHECK-LABEL: LdOffset_i32_zext64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #31200 // =0x79e0 +; CHECK-NEXT: movk w8, #63, lsl #16 +; CHECK-NEXT: ldr w0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992 + %val = load i32, ptr %arrayidx, align 2 + %conv = zext i32 %val to i64 + ret i64 %conv +} + +; LDRSWroX +define i64 @LdOffset_i32_sext64(ptr %a) { +; CHECK-LABEL: LdOffset_i32_sext64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #31200 // =0x79e0 +; CHECK-NEXT: movk w8, #63, lsl #16 +; CHECK-NEXT: ldrsw x0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992 + %val = load i32, ptr %arrayidx, align 2 + %conv = sext i32 %val to i64 + ret i64 %conv +} + +; LDRXroX +define i64 @LdOffset_i64(ptr %a) { +; CHECK-LABEL: LdOffset_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #62400 // =0xf3c0 +; CHECK-NEXT: movk w8, #126, lsl #16 +; CHECK-NEXT: ldr x0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992 + %val = load i64, ptr %arrayidx, align 4 + ret i64 %val +} + +; LDRDroX +define <2 x i32> @LdOffset_v2i32(ptr %a) { +; CHECK-LABEL: LdOffset_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #62400 // =0xf3c0 +; CHECK-NEXT: movk w8, #126, lsl #16 +; CHECK-NEXT: ldr d0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds <2 x i32>, ptr %a, i64 1039992 + %val = load <2 x i32>, ptr %arrayidx, align 4 + ret <2 x i32> %val +} + +; LDRQroX +define <2 x i64> 
@LdOffset_v2i64(ptr %a) { +; CHECK-LABEL: LdOffset_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #59264 // =0xe780 +; CHECK-NEXT: movk w8, #253, lsl #16 +; CHECK-NEXT: ldr q0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds <2 x i64>, ptr %a, i64 1039992 + %val = load <2 x i64>, ptr %arrayidx, align 4 + ret <2 x i64> %val +} + +; LDRSBWroX +define double @LdOffset_i8_f64(ptr %a) { +; CHECK-LABEL: LdOffset_i8_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrsb w8, [x0, x8] +; CHECK-NEXT: scvtf d0, w8 +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 + %val = load i8, ptr %arrayidx, align 1 + %conv = sitofp i8 %val to double + ret double %conv +} + +; LDRSHWroX +define double @LdOffset_i16_f64(ptr %a) { +; CHECK-LABEL: LdOffset_i16_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #48368 // =0xbcf0 +; CHECK-NEXT: movk w8, #31, lsl #16 +; CHECK-NEXT: ldrsh w8, [x0, x8] +; CHECK-NEXT: scvtf d0, w8 +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 + %val = load i16, ptr %arrayidx, align 2 + %conv = sitofp i16 %val to double + ret double %conv +} + +; LDRSroX +define double @LdOffset_i32_f64(ptr %a) { +; CHECK-LABEL: LdOffset_i32_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #31200 // =0x79e0 +; CHECK-NEXT: movk w8, #63, lsl #16 +; CHECK-NEXT: ldr s0, [x0, x8] +; CHECK-NEXT: ucvtf d0, d0 +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992 + %val = load i32, ptr %arrayidx, align 4 + %conv = uitofp i32 %val to double + ret double %conv +} + +; LDRDroX +define double @LdOffset_i64_f64(ptr %a) { +; CHECK-LABEL: LdOffset_i64_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #62400 // =0xf3c0 +; CHECK-NEXT: movk w8, #126, lsl #16 +; CHECK-NEXT: ldr d0, [x0, x8] +; CHECK-NEXT: scvtf d0, d0 +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992 + %val = load i64, ptr %arrayidx, align 8 + %conv = sitofp i64 %val to double + ret double %conv +} + define i64 @LdOffset_i64_multi_offset(ptr %a) { ; CHECK-LABEL: LdOffset_i64_multi_offset: ; CHECK: // %bb.0: @@ -295,3 +535,27 @@ define i32 @LdOffset_i16_odd_offset(ptr nocapture noundef readonly %a) { ret i32 %conv } +; Already encoded with a single mov MOVNWi +define i8 @LdOffset_i8_movnwi(ptr %a) { +; CHECK-LABEL: LdOffset_i8_movnwi: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #16777215 // =0xffffff +; CHECK-NEXT: ldrb w0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i8, ptr %a, i64 16777215 + %val = load i8, ptr %arrayidx, align 1 + ret i8 %val +} + +; Negative test: the offset is too large to encoded with a add +define i8 @LdOffset_i8_too_large(ptr %a) { +; CHECK-LABEL: LdOffset_i8_too_large: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: movk w8, #256, lsl #16 +; CHECK-NEXT: ldrb w0, [x0, x8] +; CHECK-NEXT: ret + %arrayidx = getelementptr inbounds i8, ptr %a, i64 16777217 + %val = load i8, ptr %arrayidx, align 1 + ret i8 %val +} diff --git a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir new file mode 100755 index 0000000000000..488f1ffdb52f3 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir @@ -0,0 +1,48 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass aarch64-ldst-opt %s -o - | FileCheck %s + + +--- +name: LdOffset +tracksRegLiveness: true +liveins: + - { 
reg: '$x0', virtual-reg: '' } +body: | + bb.0.entry: + liveins: $x0 + + ; CHECK-LABEL: name: LdOffset + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0 + ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 + ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + renamable $w8 = MOVZWi 56952, 0 + renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 + renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0 + RET undef $lr, implicit $w0 +... + +# Negative test: the IndexReg missing killed flags +--- +name: LdOffset_missing_killed +tracksRegLiveness: true +liveins: + - { reg: '$x0', virtual-reg: '' } +body: | + bb.0.entry: + liveins: $x0 + + ; CHECK-LABEL: name: LdOffset_missing_killed + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0 + ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 + ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, renamable $x8, 0, 0 + ; CHECK-NEXT: RET undef $lr, implicit $w0 + renamable $w8 = MOVZWi 56952, 0 + renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 + renamable $w0 = LDRBBroX killed renamable $x0, renamable $x8, 0, 0 + RET undef $lr, implicit $w0 +... From 32878c2065c8005b3ea30c79e16dfd7eed55d645 Mon Sep 17 00:00:00 2001 From: zhongyunde 00443407 <zhongyunde@huawei.com> Date: Fri, 10 Nov 2023 07:29:03 -0500 Subject: [PATCH 047/342] [AArch64] merge index address with large offset into base address A case for this transformation, https://gcc.godbolt.org/z/nhYcWq1WE Fold mov w8, #56952 movk w8, #15, lsl #16 ldrb w0, [x0, x8] into add x0, x0, 1036288 ldrb w0, [x0, 3704] Only LDRBBroX is supported for the first time. Fix https://github.com/llvm/llvm-project/issues/71917 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 10 + llvm/lib/Target/AArch64/AArch64InstrInfo.h | 3 + .../AArch64/AArch64LoadStoreOptimizer.cpp | 229 ++++++++++++++++++ llvm/test/CodeGen/AArch64/arm64-addrmode.ll | 15 +- .../AArch64/large-offset-ldr-merge.mir | 5 +- 5 files changed, 250 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 7d71c316bcb0a..855c7f1710256 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4089,6 +4089,16 @@ AArch64InstrInfo::getLdStOffsetOp(const MachineInstr &MI) { return MI.getOperand(Idx); } +const MachineOperand & +AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case AArch64::LDRBBroX: + return MI.getOperand(4); + } +} + static const TargetRegisterClass *getRegClass(const MachineInstr &MI, Register Reg) { if (MI.getParent() == nullptr) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 6526f6740747a..db24a19fe5f8e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -111,6 +111,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { /// Returns the immediate offset operator of a load/store. static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI); + /// Returns the shift amount operator of a load/store. + static const MachineOperand &getLdStAmountOp(const MachineInstr &MI); + /// Returns whether the instruction is FP or NEON. 
static bool isFpOrNEON(const MachineInstr &MI); diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index dc6d5b8950c34..b435b3ce03e7e 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); +STATISTIC(NumConstOffsetFolded, + "Number of const offset of index address folded"); DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming", "Controls which pairs are considered for renaming"); @@ -75,6 +77,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100), cl::Hidden); +// The LdStConstLimit limits how far we search for const offset instructions +// when we form index address load/store instructions. +static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit", + cl::init(10), cl::Hidden); + // Enable register renaming to find additional store pairing opportunities. static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming", cl::init(true), cl::Hidden); @@ -171,6 +178,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit); + // Scan the instruction list to find a register assigned with a const + // value that can be combined with the current instruction (a load or store) + // using base addressing with writeback. Scan forwards. + MachineBasicBlock::iterator + findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit, + unsigned &Offset); + // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan backwards. @@ -182,11 +196,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset); + bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI, + unsigned IndexReg, unsigned &Offset); + // Merge a pre- or post-index base register update into a ld/st instruction. MachineBasicBlock::iterator mergeUpdateInsn(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update, bool IsPreIdx); + MachineBasicBlock::iterator + mergeConstOffsetInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, unsigned Offset, + int Scale); + // Find and merge zero store instructions. bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI); @@ -199,6 +221,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find and merge a base register updates before or after a ld/st instruction. bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI); + // Find and merge a index ldr/st instructions into a base ld/st instruction. + bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale); + bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); bool runOnMachineFunction(MachineFunction &Fn) override; @@ -481,6 +506,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { } } +static unsigned getBaseAddressOpcode(unsigned Opc) { + // TODO: Add more index address loads/stores. 
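+  // Maps a register-offset (reg+reg) load to its unsigned scaled-immediate
+  // (base + imm12) form.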
+ switch (Opc) { + default: + llvm_unreachable("Opcode has no base address equivalent!"); + case AArch64::LDRBBroX: + return AArch64::LDRBBui; + } +} + static unsigned getPostIndexedOpcode(unsigned Opc) { switch (Opc) { default: @@ -722,6 +757,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) { } } +// Make sure this is a reg+reg Ld/St +static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) { + unsigned Opc = MI.getOpcode(); + switch (Opc) { + default: + return false; + // Scaled instructions. + // TODO: Add more index address loads/stores. + case AArch64::LDRBBroX: + Scale = 1; + return true; + } +} + static bool isRewritableImplicitDef(unsigned Opc) { switch (Opc) { default: @@ -2018,6 +2067,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, return NextI; } +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, + unsigned Offset, int Scale) { + assert((Update->getOpcode() == AArch64::MOVKWi) && + "Unexpected const mov instruction to merge!"); + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineBasicBlock::iterator NextI = next_nodbg(I, E); + MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E); + MachineInstr &MemMI = *I; + unsigned Mask = (1 << 12) * Scale - 1; + unsigned Low = Offset & Mask; + unsigned High = Offset - Low; + Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg(); + Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); + MachineInstrBuilder AddMIB, MemMIB; + + // Add IndexReg, BaseReg, High (the BaseReg may be SP) + AddMIB = + BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri)) + .addDef(IndexReg) + .addUse(BaseReg) + .addImm(High >> 12) // shifted value + .addImm(12); // shift 12 + (void)AddMIB; + // Ld/St DestReg, IndexReg, Imm12 + unsigned NewOpc = getBaseAddressOpcode(I->getOpcode()); + MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .add(getLdStRegOp(MemMI)) + .add(AArch64InstrInfo::getLdStOffsetOp(MemMI)) + .addImm(Low / Scale) + .setMemRefs(I->memoperands()) + .setMIFlags(I->mergeFlagsWith(*Update)); + (void)MemMIB; + + ++NumConstOffsetFolded; + LLVM_DEBUG(dbgs() << "Creating base address load/store.\n"); + LLVM_DEBUG(dbgs() << " Replacing instructions:\n "); + LLVM_DEBUG(PrevI->print(dbgs())); + LLVM_DEBUG(dbgs() << " "); + LLVM_DEBUG(Update->print(dbgs())); + LLVM_DEBUG(dbgs() << " "); + LLVM_DEBUG(I->print(dbgs())); + LLVM_DEBUG(dbgs() << " with instruction:\n "); + LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs())); + LLVM_DEBUG(dbgs() << " "); + LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs())); + LLVM_DEBUG(dbgs() << "\n"); + + // Erase the old instructions for the block. + I->eraseFromParent(); + PrevI->eraseFromParent(); + Update->eraseFromParent(); + + return NextI; +} + bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset) { @@ -2065,6 +2171,31 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, return false; } +bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI, + MachineInstr &MI, + unsigned IndexReg, + unsigned &Offset) { + // The update instruction source and destination register must be the + // same as the load/store index register. + if (MI.getOpcode() == AArch64::MOVKWi && + TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) { + + // movz + movk hold a large offset of a Ld/St instruction. 
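+    // The full offset is rebuilt as (MOVK imm << shift) + MOVZ imm; it only
+    // matches when it fits in 24 bits, so the high part remains a legal
+    // shifted ADD immediate.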
+ MachineBasicBlock::iterator B = MI.getParent()->begin(); + MachineBasicBlock::iterator MBBI = &MI; + MBBI = prev_nodbg(MBBI, B); + MachineInstr &MovzMI = *MBBI; + if (MovzMI.getOpcode() == AArch64::MOVZWi) { + unsigned Low = MovzMI.getOperand(1).getImm(); + unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm(); + Offset = High + Low; + // 12-bit optionally shifted immediates are legal for adds. + return Offset >> 24 == 0; + } + } + return false; +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); @@ -2220,6 +2351,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; } +MachineBasicBlock::iterator +AArch64LoadStoreOpt::findMatchingConstOffsetBackward( + MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) { + MachineBasicBlock::iterator B = I->getParent()->begin(); + MachineBasicBlock::iterator E = I->getParent()->end(); + MachineInstr &MemMI = *I; + MachineBasicBlock::iterator MBBI = I; + + // If the load is the first instruction in the block, there's obviously + // not any matching load or store. + if (MBBI == B) + return E; + + // Make sure the IndexReg is killed and the shift amount is zero. + // TODO: Relex this restriction to extend, simplify processing now. + if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() || + !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() || + (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0)) + return E; + + Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg(); + + // Track which register units have been modified and used between the first + // insn (inclusive) and the second insn. + ModifiedRegUnits.clear(); + UsedRegUnits.clear(); + unsigned Count = 0; + do { + MBBI = prev_nodbg(MBBI, B); + MachineInstr &MI = *MBBI; + + // Don't count transient instructions towards the search limit since there + // may be different numbers of them if e.g. debug information is present. + if (!MI.isTransient()) + ++Count; + + // If we found a match, return it. + if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) { + return MBBI; + } + + // Update the status of what the instruction clobbered and used. + LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI); + + // Otherwise, if the index register is used or modified, we have no match, + // so return early. + if (!ModifiedRegUnits.available(IndexReg) || + !UsedRegUnits.available(IndexReg)) + return E; + + } while (MBBI != B && Count < Limit); + return E; +} + bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; @@ -2404,6 +2589,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate return false; } +bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, + int Scale) { + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator E = MI.getParent()->end(); + MachineBasicBlock::iterator Update; + + // Don't know how to handle unscaled pre/post-index versions below, so bail. + if (TII->hasUnscaledLdStOffset(MI.getOpcode())) + return false; + + // Look back to try to find a const offset for index LdSt instruction. 
For + // example, + // mov x8, #LargeImm ; = a * (1<<12) + imm12 + // ldr x1, [x0, x8] + // merged into: + // add x8, x0, a * (1<<12) + // ldr x1, [x8, imm12] + unsigned Offset; + Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset); + if (Update != E && (Offset & (Scale - 1)) == 0) { + // Merge the imm12 into the ld/st. + MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale); + return true; + } + + return false; +} + bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt) { @@ -2482,6 +2695,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; } + // 5) Find a register assigned with a const value that can be combined with + // into the load or store. e.g., + // mov x8, #LargeImm ; = a * (1<<12) + imm12 + // ldr x1, [x0, x8] + // ; becomes + // add x8, x0, a * (1<<12) + // ldr x1, [x8, imm12] + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + int Scale; + if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale)) + Modified = true; + else + ++MBBI; + } + return Modified; } diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll index d39029163a47a..2181eaaee7db6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll @@ -214,9 +214,8 @@ define void @t17(i64 %a) { define i8 @LdOffset_i8(ptr %a) { ; CHECK-LABEL: LdOffset_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #56952 // =0xde78 -; CHECK-NEXT: movk w8, #15, lsl #16 -; CHECK-NEXT: ldrb w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 +; CHECK-NEXT: ldrb w0, [x8, #3704] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -227,9 +226,8 @@ define i8 @LdOffset_i8(ptr %a) { define i32 @LdOffset_i8_zext32(ptr %a) { ; CHECK-LABEL: LdOffset_i8_zext32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #56952 // =0xde78 -; CHECK-NEXT: movk w8, #15, lsl #16 -; CHECK-NEXT: ldrb w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 +; CHECK-NEXT: ldrb w0, [x8, #3704] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -255,9 +253,8 @@ define i32 @LdOffset_i8_sext32(ptr %a) { define i64 @LdOffset_i8_zext64(ptr %a) { ; CHECK-LABEL: LdOffset_i8_zext64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #56952 // =0xde78 -; CHECK-NEXT: movk w8, #15, lsl #16 -; CHECK-NEXT: ldrb w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 +; CHECK-NEXT: ldrb w0, [x8, #3704] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 diff --git a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir index 488f1ffdb52f3..15b6700398ea0 100755 --- a/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir +++ b/llvm/test/CodeGen/AArch64/large-offset-ldr-merge.mir @@ -14,9 +14,8 @@ body: | ; CHECK-LABEL: name: LdOffset ; CHECK: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = MOVZWi 56952, 0 - ; CHECK-NEXT: renamable $w8 = MOVKWi $w8, 15, 16, implicit-def $x8 - ; CHECK-NEXT: renamable $w0 = LDRBBroX killed renamable $x0, killed renamable $x8, 0, 0 + ; CHECK-NEXT: $x8 = ADDXri $x0, 253, 12 + ; CHECK-NEXT: renamable $w0 = LDRBBui killed renamable $x8, 3704 ; CHECK-NEXT: RET undef $lr, implicit $w0 renamable $w8 = MOVZWi 56952, 0 renamable $w8 = MOVKWi $w8, 15, 16, 
implicit-def $x8

From f5687636415969e6d945659a0b78734abdfb0f06 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde@huawei.com>
Date: Mon, 18 Dec 2023 00:51:48 -0500
Subject: [PATCH 048/342] [AArch64] Fold more load.x into load.i with large offset

The list of load.x instructions is taken from canFoldIntoAddrMode in D152828.
Also support LDRSroX, which was missed in canFoldIntoAddrMode.
---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 13 +++
 .../AArch64/AArch64LoadStoreOptimizer.cpp     | 53 +++++++++++-
 llvm/test/CodeGen/AArch64/arm64-addrmode.ll   | 85 ++++++++-----------
 3 files changed, 97 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 855c7f1710256..175f6ef49c3ba 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4094,7 +4094,20 @@ AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
     llvm_unreachable("Unexpected opcode");
+  case AArch64::LDRBroX:
   case AArch64::LDRBBroX:
+  case AArch64::LDRSBXroX:
+  case AArch64::LDRSBWroX:
+  case AArch64::LDRHroX:
+  case AArch64::LDRHHroX:
+  case AArch64::LDRSHXroX:
+  case AArch64::LDRSHWroX:
+  case AArch64::LDRWroX:
+  case AArch64::LDRSroX:
+  case AArch64::LDRSWroX:
+  case AArch64::LDRDroX:
+  case AArch64::LDRXroX:
+  case AArch64::LDRQroX:
     return MI.getOperand(4);
   }
 }
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index b435b3ce03e7e..aa7a4bc235361 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -180,7 +180,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Scan the instruction list to find a register assigned with a const
   // value that can be combined with the current instruction (a load or store)
-  // using base addressing with writeback. Scan forwards.
+  // using base addressing with writeback. Scan backwards.
   MachineBasicBlock::iterator
   findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
                                   unsigned &Offset);
@@ -221,7 +221,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
-  // Find and merge a index ldr/st instructions into a base ld/st instruction.
+  // Find and merge an index ldr/st instruction into a base ld/st instruction.
bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale); bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); @@ -511,8 +511,34 @@ static unsigned getBaseAddressOpcode(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Opcode has no base address equivalent!"); + case AArch64::LDRBroX: + return AArch64::LDRBui; case AArch64::LDRBBroX: return AArch64::LDRBBui; + case AArch64::LDRSBXroX: + return AArch64::LDRSBXui; + case AArch64::LDRSBWroX: + return AArch64::LDRSBWui; + case AArch64::LDRHroX: + return AArch64::LDRHui; + case AArch64::LDRHHroX: + return AArch64::LDRHHui; + case AArch64::LDRSHXroX: + return AArch64::LDRSHXui; + case AArch64::LDRSHWroX: + return AArch64::LDRSHWui; + case AArch64::LDRWroX: + return AArch64::LDRWui; + case AArch64::LDRSroX: + return AArch64::LDRSui; + case AArch64::LDRSWroX: + return AArch64::LDRSWui; + case AArch64::LDRDroX: + return AArch64::LDRDui; + case AArch64::LDRXroX: + return AArch64::LDRXui; + case AArch64::LDRQroX: + return AArch64::LDRQui; } } @@ -764,10 +790,31 @@ static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) { default: return false; // Scaled instructions. - // TODO: Add more index address loads/stores. + // TODO: Add more index address stores. + case AArch64::LDRBroX: case AArch64::LDRBBroX: + case AArch64::LDRSBXroX: + case AArch64::LDRSBWroX: Scale = 1; return true; + case AArch64::LDRHroX: + case AArch64::LDRHHroX: + case AArch64::LDRSHXroX: + case AArch64::LDRSHWroX: + Scale = 2; + return true; + case AArch64::LDRWroX: + case AArch64::LDRSroX: + case AArch64::LDRSWroX: + Scale = 4; + return true; + case AArch64::LDRDroX: + case AArch64::LDRXroX: + Scale = 8; + return true; + case AArch64::LDRQroX: + Scale = 16; + return true; } } diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll index 2181eaaee7db6..bfef61abd8c12 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll @@ -239,9 +239,8 @@ define i32 @LdOffset_i8_zext32(ptr %a) { define i32 @LdOffset_i8_sext32(ptr %a) { ; CHECK-LABEL: LdOffset_i8_sext32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #56952 // =0xde78 -; CHECK-NEXT: movk w8, #15, lsl #16 -; CHECK-NEXT: ldrsb w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 +; CHECK-NEXT: ldrsb w0, [x8, #3704] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -266,9 +265,8 @@ define i64 @LdOffset_i8_zext64(ptr %a) { define i64 @LdOffset_i8_sext64(ptr %a) { ; CHECK-LABEL: LdOffset_i8_sext64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #56952 // =0xde78 -; CHECK-NEXT: movk w8, #15, lsl #16 -; CHECK-NEXT: ldrsb x0, [x0, x8] +; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 +; CHECK-NEXT: ldrsb x0, [x8, #3704] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -280,9 +278,8 @@ define i64 @LdOffset_i8_sext64(ptr %a) { define i16 @LdOffset_i16(ptr %a) { ; CHECK-LABEL: LdOffset_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #48368 // =0xbcf0 -; CHECK-NEXT: movk w8, #31, lsl #16 -; CHECK-NEXT: ldrh w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 +; CHECK-NEXT: ldrh w0, [x8, #7408] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 %val = load i16, ptr %arrayidx, align 2 @@ -293,9 +290,8 @@ define i16 @LdOffset_i16(ptr %a) { define i32 @LdOffset_i16_zext32(ptr %a) { ; CHECK-LABEL: LdOffset_i16_zext32: ; 
CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #48368 // =0xbcf0 -; CHECK-NEXT: movk w8, #31, lsl #16 -; CHECK-NEXT: ldrh w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 +; CHECK-NEXT: ldrh w0, [x8, #7408] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 %val = load i16, ptr %arrayidx, align 2 @@ -307,9 +303,8 @@ define i32 @LdOffset_i16_zext32(ptr %a) { define i32 @LdOffset_i16_sext32(ptr %a) { ; CHECK-LABEL: LdOffset_i16_sext32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #48368 // =0xbcf0 -; CHECK-NEXT: movk w8, #31, lsl #16 -; CHECK-NEXT: ldrsh w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 +; CHECK-NEXT: ldrsh w0, [x8, #7408] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 %val = load i16, ptr %arrayidx, align 2 @@ -321,9 +316,8 @@ define i32 @LdOffset_i16_sext32(ptr %a) { define i64 @LdOffset_i16_zext64(ptr %a) { ; CHECK-LABEL: LdOffset_i16_zext64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #48368 // =0xbcf0 -; CHECK-NEXT: movk w8, #31, lsl #16 -; CHECK-NEXT: ldrh w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 +; CHECK-NEXT: ldrh w0, [x8, #7408] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 %val = load i16, ptr %arrayidx, align 2 @@ -335,9 +329,8 @@ define i64 @LdOffset_i16_zext64(ptr %a) { define i64 @LdOffset_i16_sext64(ptr %a) { ; CHECK-LABEL: LdOffset_i16_sext64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #48368 // =0xbcf0 -; CHECK-NEXT: movk w8, #31, lsl #16 -; CHECK-NEXT: ldrsh x0, [x0, x8] +; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 +; CHECK-NEXT: ldrsh x0, [x8, #7408] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 %val = load i16, ptr %arrayidx, align 2 @@ -349,9 +342,8 @@ define i64 @LdOffset_i16_sext64(ptr %a) { define i32 @LdOffset_i32(ptr %a) { ; CHECK-LABEL: LdOffset_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #31200 // =0x79e0 -; CHECK-NEXT: movk w8, #63, lsl #16 -; CHECK-NEXT: ldr w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152 +; CHECK-NEXT: ldr w0, [x8, #14816] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992 %val = load i32, ptr %arrayidx, align 4 @@ -362,9 +354,8 @@ define i32 @LdOffset_i32(ptr %a) { define i64 @LdOffset_i32_zext64(ptr %a) { ; CHECK-LABEL: LdOffset_i32_zext64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #31200 // =0x79e0 -; CHECK-NEXT: movk w8, #63, lsl #16 -; CHECK-NEXT: ldr w0, [x0, x8] +; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152 +; CHECK-NEXT: ldr w0, [x8, #14816] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992 %val = load i32, ptr %arrayidx, align 2 @@ -376,9 +367,8 @@ define i64 @LdOffset_i32_zext64(ptr %a) { define i64 @LdOffset_i32_sext64(ptr %a) { ; CHECK-LABEL: LdOffset_i32_sext64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #31200 // =0x79e0 -; CHECK-NEXT: movk w8, #63, lsl #16 -; CHECK-NEXT: ldrsw x0, [x0, x8] +; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152 +; CHECK-NEXT: ldrsw x0, [x8, #14816] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992 %val = load i32, ptr %arrayidx, align 2 @@ -390,9 +380,8 @@ define i64 @LdOffset_i32_sext64(ptr %a) { define i64 @LdOffset_i64(ptr %a) { ; CHECK-LABEL: LdOffset_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #62400 // =0xf3c0 -; CHECK-NEXT: movk w8, #126, lsl #16 -; CHECK-NEXT: ldr x0, [x0, x8] +; CHECK-NEXT: add x8, x0, #2024, lsl #12 // =8290304 +; CHECK-NEXT: ldr x0, [x8, #29632] ; CHECK-NEXT: ret 
%arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992 %val = load i64, ptr %arrayidx, align 4 @@ -403,9 +392,8 @@ define i64 @LdOffset_i64(ptr %a) { define <2 x i32> @LdOffset_v2i32(ptr %a) { ; CHECK-LABEL: LdOffset_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #62400 // =0xf3c0 -; CHECK-NEXT: movk w8, #126, lsl #16 -; CHECK-NEXT: ldr d0, [x0, x8] +; CHECK-NEXT: add x8, x0, #2024, lsl #12 // =8290304 +; CHECK-NEXT: ldr d0, [x8, #29632] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds <2 x i32>, ptr %a, i64 1039992 %val = load <2 x i32>, ptr %arrayidx, align 4 @@ -416,9 +404,8 @@ define <2 x i32> @LdOffset_v2i32(ptr %a) { define <2 x i64> @LdOffset_v2i64(ptr %a) { ; CHECK-LABEL: LdOffset_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #59264 // =0xe780 -; CHECK-NEXT: movk w8, #253, lsl #16 -; CHECK-NEXT: ldr q0, [x0, x8] +; CHECK-NEXT: add x8, x0, #4048, lsl #12 // =16580608 +; CHECK-NEXT: ldr q0, [x8, #59264] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds <2 x i64>, ptr %a, i64 1039992 %val = load <2 x i64>, ptr %arrayidx, align 4 @@ -429,9 +416,8 @@ define <2 x i64> @LdOffset_v2i64(ptr %a) { define double @LdOffset_i8_f64(ptr %a) { ; CHECK-LABEL: LdOffset_i8_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #56952 // =0xde78 -; CHECK-NEXT: movk w8, #15, lsl #16 -; CHECK-NEXT: ldrsb w8, [x0, x8] +; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 +; CHECK-NEXT: ldrsb w8, [x8, #3704] ; CHECK-NEXT: scvtf d0, w8 ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 @@ -444,9 +430,8 @@ define double @LdOffset_i8_f64(ptr %a) { define double @LdOffset_i16_f64(ptr %a) { ; CHECK-LABEL: LdOffset_i16_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #48368 // =0xbcf0 -; CHECK-NEXT: movk w8, #31, lsl #16 -; CHECK-NEXT: ldrsh w8, [x0, x8] +; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 +; CHECK-NEXT: ldrsh w8, [x8, #7408] ; CHECK-NEXT: scvtf d0, w8 ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 @@ -459,9 +444,8 @@ define double @LdOffset_i16_f64(ptr %a) { define double @LdOffset_i32_f64(ptr %a) { ; CHECK-LABEL: LdOffset_i32_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #31200 // =0x79e0 -; CHECK-NEXT: movk w8, #63, lsl #16 -; CHECK-NEXT: ldr s0, [x0, x8] +; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152 +; CHECK-NEXT: ldr s0, [x8, #14816] ; CHECK-NEXT: ucvtf d0, d0 ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992 @@ -474,9 +458,8 @@ define double @LdOffset_i32_f64(ptr %a) { define double @LdOffset_i64_f64(ptr %a) { ; CHECK-LABEL: LdOffset_i64_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #62400 // =0xf3c0 -; CHECK-NEXT: movk w8, #126, lsl #16 -; CHECK-NEXT: ldr d0, [x0, x8] +; CHECK-NEXT: add x8, x0, #2024, lsl #12 // =8290304 +; CHECK-NEXT: ldr d0, [x8, #29632] ; CHECK-NEXT: scvtf d0, d0 ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992 From c4ff0a67d146030636e96eab4992233a7b5858d8 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis <paschalis.mpeis@arm.com> Date: Thu, 21 Dec 2023 11:02:54 +0000 Subject: [PATCH 049/342] [TLI] Add getLibFunc that accepts an Opcode and scalar Type. (#75919) It sets a LibFunc similarly with the other two getLibFunc methods. Currently, it supports only the FRem Instruction. Add tests for FRem. 
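As a rough usage sketch (hypothetical caller; the `TLI` reference, the frem
instruction `I`, and the TargetLibraryInfo include are assumed here and are
not part of this patch):

  LibFunc Func;
  if (TLI.getLibFunc(I.getOpcode(), I.getType(), Func))
    errs() << TLI.getName(Func) << "\n"; // "fmod" for double, "fmodf" for float

Any other opcode, or a type other than float/double, returns false and leaves
Func unset.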
--- .../include/llvm/Analysis/TargetLibraryInfo.h | 10 ++++++ llvm/lib/Analysis/TargetLibraryInfo.cpp | 10 ++++++ .../Analysis/TargetLibraryInfoTest.cpp | 35 +++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h index 2ffd4d4b71439..daf1d8e2079f8 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -156,6 +156,10 @@ class TargetLibraryInfoImpl { /// FDecl is assumed to have a parent Module when using this function. bool getLibFunc(const Function &FDecl, LibFunc &F) const; + /// Searches for a function name using an Instruction \p Opcode. + /// Currently, only the frem instruction is supported. + bool getLibFunc(unsigned int Opcode, Type *Ty, LibFunc &F) const; + /// Forces a function to be marked as unavailable. void setUnavailable(LibFunc F) { setState(F, Unavailable); @@ -360,6 +364,12 @@ class TargetLibraryInfo { getLibFunc(*(CB.getCalledFunction()), F); } + /// Searches for a function name using an Instruction \p Opcode. + /// Currently, only the frem instruction is supported. + bool getLibFunc(unsigned int Opcode, Type *Ty, LibFunc &F) const { + return Impl->getLibFunc(Opcode, Ty, F); + } + /// Disables all builtins. /// /// This can be used for options like -fno-builtin. diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 20959cf6948f6..bbb7c86d21856 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -1149,6 +1149,16 @@ bool TargetLibraryInfoImpl::getLibFunc(const Function &FDecl, return isValidProtoForLibFunc(*FDecl.getFunctionType(), F, *M); } +bool TargetLibraryInfoImpl::getLibFunc(unsigned int Opcode, Type *Ty, + LibFunc &F) const { + // Must be a frem instruction with float or double arguments. + if (Opcode != Instruction::FRem || (!Ty->isDoubleTy() && !Ty->isFloatTy())) + return false; + + F = Ty->isDoubleTy() ? LibFunc_fmod : LibFunc_fmodf; + return true; +} + void TargetLibraryInfoImpl::disableAllFunctions() { memset(AvailableArray, 0, sizeof(AvailableArray)); } diff --git a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp index 292b5cade9509..34b06fe480f36 100644 --- a/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp +++ b/llvm/unittests/Analysis/TargetLibraryInfoTest.cpp @@ -621,3 +621,38 @@ TEST_F(TargetLibraryInfoTest, ValidProto) { EXPECT_TRUE(isLibFunc(F, LF)); } } + +namespace { + +/// Creates TLI for AArch64 and uses it to get the LibFunc names for the given +/// Instruction opcode and Type. +class TLITestAarch64 : public ::testing::Test { +private: + const Triple TargetTriple; + +protected: + LLVMContext Ctx; + std::unique_ptr<TargetLibraryInfoImpl> TLII; + std::unique_ptr<TargetLibraryInfo> TLI; + + /// Create TLI for AArch64 + TLITestAarch64() : TargetTriple(Triple("aarch64-unknown-linux-gnu")) { + TLII = std::make_unique<TargetLibraryInfoImpl>( + TargetLibraryInfoImpl(TargetTriple)); + TLI = std::make_unique<TargetLibraryInfo>(TargetLibraryInfo(*TLII)); + } + + /// Returns the TLI function name for the given \p Opcode and type \p Ty. 
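+  /// Returns an empty string when no library function is mapped, i.e. for
+  /// opcodes other than FRem or for types other than float and double.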
+ StringRef getScalarName(unsigned int Opcode, Type *Ty) { + LibFunc Func; + if (!TLI->getLibFunc(Opcode, Ty, Func)) + return ""; + return TLI->getName(Func); + } +}; +} // end anonymous namespace + +TEST_F(TLITestAarch64, TestFrem) { + EXPECT_EQ(getScalarName(Instruction::FRem, Type::getDoubleTy(Ctx)), "fmod"); + EXPECT_EQ(getScalarName(Instruction::FRem, Type::getFloatTy(Ctx)), "fmodf"); +} \ No newline at end of file From e6d2bb0ed8c8e9ee97f502315871d7819b795058 Mon Sep 17 00:00:00 2001 From: Z572 <zhengjunjie@iscas.ac.cn> Date: Thu, 21 Dec 2023 05:13:09 -0600 Subject: [PATCH 050/342] [InstCombine] Simplifiy `(-x * y * -x)` into `(x * y * x)` (#72953) fix https://github.com/llvm/llvm-project/issues/72259 proof: https://alive2.llvm.org/ce/z/HsrmTC --- .../InstCombine/InstCombineMulDivRem.cpp | 7 + .../InstCombine/mul-inseltpoison.ll | 5 +- llvm/test/Transforms/InstCombine/mul.ll | 173 ++++++++++++++++++ 3 files changed, 182 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index e5566578869dd..f0ea3d9fcad5d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -350,6 +350,13 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { if (match(&I, m_c_Mul(m_OneUse(m_Neg(m_Value(X))), m_Value(Y)))) return BinaryOperator::CreateNeg(Builder.CreateMul(X, Y)); + // (-X * Y) * -X --> (X * Y) * X + // (-X << Y) * -X --> (X << Y) * X + if (match(Op1, m_Neg(m_Value(X)))) { + if (Value *NegOp0 = Negator::Negate(false, /*IsNSW*/ false, Op0, *this)) + return BinaryOperator::CreateMul(NegOp0, X); + } + // (X / Y) * Y = X - (X % Y) // (X / Y) * -Y = (X % Y) - X { diff --git a/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll b/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll index 448558d755a57..8fe4261bbf009 100644 --- a/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll @@ -672,9 +672,8 @@ define <2 x i32> @test_mul_canonicalize_vec(<2 x i32> %x, <2 x i32> %y) { define i32 @test_mul_canonicalize_multiple_uses(i32 %x, i32 %y) { ; CHECK-LABEL: @test_mul_canonicalize_multiple_uses( -; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]] -; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[NEG]], [[Y:%.*]] -; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL]], [[NEG]] +; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]] ; CHECK-NEXT: ret i32 [[MUL2]] ; %neg = sub i32 0, %x diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll index 9fe8462c5d315..b404fcffbf422 100644 --- a/llvm/test/Transforms/InstCombine/mul.ll +++ b/llvm/test/Transforms/InstCombine/mul.ll @@ -1255,17 +1255,190 @@ define <2 x i32> @test_mul_canonicalize_vec(<2 x i32> %x, <2 x i32> %y) { define i32 @test_mul_canonicalize_multiple_uses(i32 %x, i32 %y) { ; CHECK-LABEL: @test_mul_canonicalize_multiple_uses( +; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]] +; CHECK-NEXT: ret i32 [[MUL2]] +; + %neg = sub i32 0, %x + %mul = mul i32 %neg, %y + %mul2 = mul i32 %mul, %neg + ret i32 %mul2 +} + +define i32 @mul_nsw_mul_nsw_neg(i32 %x, i32 %y) { +; CHECK-LABEL: @mul_nsw_mul_nsw_neg( +; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]] +; CHECK-NEXT: ret i32 [[MUL2]] +; + %neg = 
sub i32 0, %x + %mul = mul nsw i32 %neg, %y + %mul2 = mul nsw i32 %mul, %neg + ret i32 %mul2 +} + +define i32 @mul_mul_nsw_neg(i32 %x,i32 %y) { +; CHECK-LABEL: @mul_mul_nsw_neg( +; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]] +; CHECK-NEXT: ret i32 [[MUL2]] +; + %neg = sub i32 0, %x + %mul = mul nsw i32 %neg, %y + %mul2 = mul i32 %mul, %neg + ret i32 %mul2 +} + +define i32 @mul_nsw_mul_neg(i32 %x,i32 %y) { +; CHECK-LABEL: @mul_nsw_mul_neg( +; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]] +; CHECK-NEXT: ret i32 [[MUL2]] +; + %neg = sub i32 0, %x + %mul = mul i32 %neg, %y + %mul2 = mul nsw i32 %mul, %neg + ret i32 %mul2 +} + +define i32 @mul_nsw_mul_neg_onearg(i32 %x) { +; CHECK-LABEL: @mul_nsw_mul_neg_onearg( +; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[X]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]] +; CHECK-NEXT: ret i32 [[MUL2]] +; + %neg = sub i32 0, %x + %mul = mul i32 %neg, %x + %mul2 = mul nsw i32 %mul, %neg + ret i32 %mul2 +} + +define i8 @mul_mul_nsw_neg_onearg(i8 %x) { +; CHECK-LABEL: @mul_mul_nsw_neg_onearg( +; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i8 [[X:%.*]], [[X]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i8 [[MUL_NEG]], [[X]] +; CHECK-NEXT: ret i8 [[MUL2]] +; + %neg = sub i8 0, %x + %mul = mul nsw i8 %neg, %x + %mul2 = mul i8 %mul, %neg + ret i8 %mul2 +} + +define i32 @mul_nsw_mul_nsw_neg_onearg(i32 %x) { +; CHECK-LABEL: @mul_nsw_mul_nsw_neg_onearg( +; CHECK-NEXT: [[MUL_NEG:%.*]] = mul i32 [[X:%.*]], [[X]] +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL_NEG]], [[X]] +; CHECK-NEXT: ret i32 [[MUL2]] +; + %neg = sub i32 0, %x + %mul = mul nsw i32 %neg, %x + %mul2 = mul nsw i32 %mul, %neg + ret i32 %mul2 +} + +define i32 @mul_nsw_shl_nsw_neg(i32 %x, i32 %y) { +; CHECK-LABEL: @mul_nsw_shl_nsw_neg( +; CHECK-NEXT: [[SHL_NEG:%.*]] = shl i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SHL_NEG]], [[X]] +; CHECK-NEXT: ret i32 [[MUL]] +; + %neg = sub i32 0, %x + %shl = shl nsw i32 %neg, %y + %mul = mul nsw i32 %shl, %neg + ret i32 %mul +} + +define i32 @mul_shl_nsw_neg(i32 %x,i32 %y) { +; CHECK-LABEL: @mul_shl_nsw_neg( +; CHECK-NEXT: [[SHL_NEG:%.*]] = shl i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SHL_NEG]], [[X]] +; CHECK-NEXT: ret i32 [[MUL]] +; + %neg = sub i32 0, %x + %shl = shl nsw i32 %neg, %y + %mul = mul i32 %shl, %neg + ret i32 %mul +} + +define i32 @mul_nsw_shl_neg(i32 %x,i32 %y) { +; CHECK-LABEL: @mul_nsw_shl_neg( +; CHECK-NEXT: [[SHL_NEG:%.*]] = shl i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SHL_NEG]], [[X]] +; CHECK-NEXT: ret i32 [[MUL]] +; + %neg = sub i32 0, %x + %shl = shl i32 %neg, %y + %mul = mul nsw i32 %shl, %neg + ret i32 %mul +} + +define i32 @mul_nsw_shl_neg_onearg(i32 %x) { +; CHECK-LABEL: @mul_nsw_shl_neg_onearg( +; CHECK-NEXT: [[SHL_NEG:%.*]] = shl i32 [[X:%.*]], [[X]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SHL_NEG]], [[X]] +; CHECK-NEXT: ret i32 [[MUL]] +; + %neg = sub i32 0, %x + %shl = shl i32 %neg, %x + %mul = mul nsw i32 %shl, %neg + ret i32 %mul +} + +define i8 @mul_shl_nsw_neg_onearg(i8 %x) { +; CHECK-LABEL: @mul_shl_nsw_neg_onearg( +; CHECK-NEXT: [[SHL_NEG:%.*]] = shl i8 [[X:%.*]], [[X]] +; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[SHL_NEG]], [[X]] +; CHECK-NEXT: ret i8 [[MUL]] +; + %neg = sub i8 0, %x + %shl = shl nsw i8 %neg, %x + %mul = mul i8 %shl, %neg + ret i8 %mul +} + +define i32 @mul_nsw_shl_nsw_neg_onearg(i32 %x) { +; CHECK-LABEL: 
@mul_nsw_shl_nsw_neg_onearg( +; CHECK-NEXT: [[SHL_NEG:%.*]] = mul i32 [[X:%.*]], [[X]] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[SHL_NEG]], [[X]] +; CHECK-NEXT: ret i32 [[MUL]] +; + %neg = sub i32 0, %x + %shl = mul nsw i32 %neg, %x + %mul = mul nsw i32 %shl, %neg + ret i32 %mul +} + +define i32 @mul_use_mul_neg(i32 %x,i32 %y) { +; CHECK-LABEL: @mul_use_mul_neg( ; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: call void @use32(i32 [[MUL]]) ; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[MUL]], [[NEG]] ; CHECK-NEXT: ret i32 [[MUL2]] ; %neg = sub i32 0, %x %mul = mul i32 %neg, %y + call void @use32(i32 %mul) %mul2 = mul i32 %mul, %neg ret i32 %mul2 } +define i32 @mul_shl_use_mul_neg(i32 %x,i32 %y) { +; CHECK-LABEL: @mul_shl_use_mul_neg( +; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]] +; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[NEG]], [[Y:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SHL]]) +; CHECK-NEXT: [[MUL2:%.*]] = mul i32 [[SHL]], [[NEG]] +; CHECK-NEXT: ret i32 [[MUL2]] +; + %neg = sub i32 0, %x + %shl = shl i32 %neg, %y + call void @use32(i32 %shl) + %mul2 = mul i32 %shl, %neg + ret i32 %mul2 +} + @X = global i32 5 define i64 @test_mul_canonicalize_neg_is_not_undone(i64 %L1) { From cf0be7b4920cec762639a1f39e8ccf1868e44c40 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan <fpiovezan@apple.com> Date: Thu, 21 Dec 2023 09:04:24 -0300 Subject: [PATCH 051/342] [lldb] Add actionable feedback when overwriting a command fails (#76030) If adding a user commands fails because a command with the same name already exists, we only say that "force replace is not set" without telling the user _how_ to set it. There are two ways to do so; this commit changes the error message to mention both. --- lldb/source/Interpreter/CommandInterpreter.cpp | 6 +++++- .../commands/command/script/TestCommandScript.py | 13 +++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index e1275ce711fc1..00651df48b622 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -1160,7 +1160,11 @@ Status CommandInterpreter::AddUserCommand(llvm::StringRef name, if (UserCommandExists(name)) { if (!can_replace) { - result.SetErrorString("user command exists and force replace not set"); + result.SetErrorStringWithFormatv( + "user command \"{0}\" already exists and force replace was not set " + "by --overwrite or 'settings set interpreter.require-overwrite " + "false'", + name); return result; } if (cmd_sp->IsMultiwordObject()) { diff --git a/lldb/test/API/commands/command/script/TestCommandScript.py b/lldb/test/API/commands/command/script/TestCommandScript.py index cac11834fa736..850552032902f 100644 --- a/lldb/test/API/commands/command/script/TestCommandScript.py +++ b/lldb/test/API/commands/command/script/TestCommandScript.py @@ -161,6 +161,19 @@ def cleanup(): ) self.expect("my_command", substrs=["a.out"]) + # Test that without --overwrite we are not allowed to redefine the command. 
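+        # Passing --overwrite (or setting interpreter.require-overwrite to
+        # false) would allow the redefinition instead.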
+ self.expect( + "command script add my_command --class welcome.TargetnameCommand", + substrs=[ + ( + 'user command "my_command" already exists and force replace was' + " not set by --overwrite or 'settings set" + " interpreter.require-overwrite false'" + ), + ], + error=True, + ) + self.runCmd("command script clear") self.expect( From a047675bbf476300fd159736d8ab0d6cb23fe934 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin <kerry.mclaughlin@arm.com> Date: Thu, 21 Dec 2023 12:08:47 +0000 Subject: [PATCH 052/342] [Clang][SME2] Add builtins for multi-vector fp round to integral value (#75941) Adds the following SME2 builtins: - svrinta, svrintm, svrintn, svrintp (x2 & x4) --- clang/include/clang/Basic/arm_sve.td | 15 + .../aarch64-sme2-intrinsics/acle_sme2_frint.c | 282 ++++++++++++++++++ 2 files changed, 297 insertions(+) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 04bf7acdeba79..dcce325188bc4 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -2141,6 +2141,21 @@ let TargetGuard = "sme2" in { defm SVMAXNM : SInstMinMaxByVector<"max">; } +let TargetGuard = "sme2" in { + // FRINTA / FRINTM / FRINTN / FRINTP + def SVRINTA_X2 : SInst<"svrinta[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frinta_x2", [IsStreaming], []>; + def SVRINTA_X4 : SInst<"svrinta[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frinta_x4", [IsStreaming], []>; + + def SVRINTM_X2 : SInst<"svrintm[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintm_x2", [IsStreaming], []>; + def SVRINTM_X4 : SInst<"svrintm[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintm_x4", [IsStreaming], []>; + + def SVRINTN_X2 : SInst<"svrintn[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintn_x2", [IsStreaming], []>; + def SVRINTN_X4 : SInst<"svrintn[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintn_x4", [IsStreaming], []>; + + def SVRINTP_X2 : SInst<"svrintp[_{d}_x2]", "22", "f", MergeNone, "aarch64_sve_frintp_x2", [IsStreaming], []>; + def SVRINTP_X4 : SInst<"svrintp[_{d}_x4]", "44", "f", MergeNone, "aarch64_sve_frintp_x4", [IsStreaming], []>; +} + let TargetGuard = "sme2" in { def SVSCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "csil", MergeNone, "aarch64_sve_sclamp_single_x2", [IsStreaming], []>; def SVUCLAMP_X2 : SInst<"svclamp[_single_{d}_x2]", "22dd", "UcUsUiUl", MergeNone, "aarch64_sve_uclamp_single_x2", [IsStreaming], []>; diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c new file mode 100644 index 0000000000000..2a34b0e2878ef --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c @@ -0,0 +1,282 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p 
mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include <arm_sme_draft_spec_subject_to_change.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED) A1 +#else +#define SVE_ACLE_FUNC(A1,A2) A1##A2 +#endif + +// FRINTA + +// CHECK-LABEL: @test_svfrinta_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4) +// CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrinta_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]] +// +svfloat32x2_t test_svfrinta_f32_x2(svfloat32x2_t zn) __arm_streaming { + return SVE_ACLE_FUNC(svrinta,_f32_x2)(zn); +} + +// CHECK-LABEL: @test_svfrinta_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> 
@llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12) +// CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrinta_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frinta.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, 
<vscale x 4 x float> } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]] +// +svfloat32x4_t test_svfrinta_f32_x4(svfloat32x4_t zn) __arm_streaming { + return SVE_ACLE_FUNC(svrinta,_f32_x4)(zn); +} + +// FRINTM + +// CHECK-LABEL: @test_svfrintam_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4) +// CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z21test_svfrintam_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]] +// +svfloat32x2_t test_svfrintam_f32_x2(svfloat32x2_t zn) __arm_streaming { + return SVE_ACLE_FUNC(svrintm,_f32_x2)(zn); +} + +// CHECK-LABEL: @test_svfrintm_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> 
@llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12) +// CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintm_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintm.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, 
<vscale x 4 x float> } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]] +// +svfloat32x4_t test_svfrintm_f32_x4(svfloat32x4_t zn) __arm_streaming { + return SVE_ACLE_FUNC(svrintm,_f32_x4)(zn); +} + +// FRINTN + +// CHECK-LABEL: @test_svfrintn_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4) +// CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintn_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]] +// +svfloat32x2_t test_svfrintn_f32_x2(svfloat32x2_t zn) __arm_streaming { + return SVE_ACLE_FUNC(svrintn,_f32_x2)(zn); +} + +// CHECK-LABEL: @test_svfrintn_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> 
@llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12) +// CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintn_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, 
<vscale x 4 x float> } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]] +// +svfloat32x4_t test_svfrintn_f32_x4(svfloat32x4_t zn) __arm_streaming { + return SVE_ACLE_FUNC(svrintn,_f32_x4)(zn); +} + +// FRINTP + +// CHECK-LABEL: @test_svfrintp_f32_x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4) +// CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintp_f32_x213svfloat32x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 0 +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> poison, <vscale x 4 x float> [[TMP3]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[TMP2]], 1 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], i64 4) +// CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP6]] +// +svfloat32x2_t test_svfrintp_f32_x2(svfloat32x2_t zn) __arm_streaming { + return SVE_ACLE_FUNC(svrintp,_f32_x2)(zn); +} + +// CHECK-LABEL: @test_svfrintp_f32_x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> 
@llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4) +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 2 +// CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3 +// CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12) +// CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]] +// +// CPP-CHECK-LABEL: @_Z20test_svfrintp_f32_x413svfloat32x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.frintp.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 0 +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> poison, <vscale x 4 x float> [[TMP5]], i64 0) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 1 +// CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 4) +// CPP-CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, 
<vscale x 4 x float> } [[TMP4]], 2 +// CPP-CHECK-NEXT: [[TMP10:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP8]], <vscale x 4 x float> [[TMP9]], i64 8) +// CPP-CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[TMP4]], 3 +// CPP-CHECK-NEXT: [[TMP12:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP10]], <vscale x 4 x float> [[TMP11]], i64 12) +// CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP12]] +// +svfloat32x4_t test_svfrintp_f32_x4(svfloat32x4_t zn) __arm_streaming { + return SVE_ACLE_FUNC(svrintp,_f32_x4)(zn); +} From 36b6f77565c9d3d75c03600df4b8719bb518bdc0 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet <gchatelet@google.com> Date: Thu, 21 Dec 2023 13:16:51 +0100 Subject: [PATCH 053/342] [libc][NFC] Unify `FPBits` implementations (#76033) `FPBits` is currently implemented as a general case and a specialization for `long double` when `long double` is x86 80-bit extended precision. This patch is a first of a series to provide an implementation based on `FPType` instead of the C++ float type (which implementation is architecture dependent). --- libc/src/__support/FPUtil/FPBits.h | 178 ++++++++++-------- libc/src/__support/FPUtil/FloatProperties.h | 2 +- .../__support/FPUtil/x86_64/LongDoubleBits.h | 96 ++-------- 3 files changed, 124 insertions(+), 152 deletions(-) diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index b13ce80f94f6e..790449cc0080c 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -21,41 +21,37 @@ namespace LIBC_NAMESPACE { namespace fputil { -// A generic class to represent single precision, double precision, and quad -// precision IEEE 754 floating point formats. -// On most platforms, the 'float' type corresponds to single precision floating -// point numbers, the 'double' type corresponds to double precision floating -// point numers, and the 'long double' type corresponds to the quad precision -// floating numbers. On x86 platforms however, the 'long double' type maps to -// an x87 floating point format. This format is an IEEE 754 extension format. -// It is handled as an explicit specialization of this class. -template <typename T> struct FPBits : private FloatProperties<T> { - static_assert(cpp::is_floating_point_v<T>, - "FPBits instantiated with invalid type."); - using typename FloatProperties<T>::StorageType; - using FloatProperties<T>::TOTAL_LEN; +namespace internal { -private: - using FloatProperties<T>::EXP_SIG_MASK; - -public: - using FloatProperties<T>::EXP_MASK; - using FloatProperties<T>::EXP_BIAS; - using FloatProperties<T>::EXP_LEN; - using FloatProperties<T>::FRACTION_MASK; - using FloatProperties<T>::FRACTION_LEN; +// This is a temporary class to unify common methods and properties between +// FPBits and FPBits<long double>. 
+template <FPType fp_type> struct FPBitsCommon : private FPProperties<fp_type> { + using UP = FPProperties<fp_type>; + using typename UP::StorageType; + using UP::TOTAL_LEN; -private: - using FloatProperties<T>::QUIET_NAN_MASK; +protected: + using UP::EXP_SIG_MASK; + using UP::QUIET_NAN_MASK; public: - using FloatProperties<T>::SIGN_MASK; + using UP::EXP_BIAS; + using UP::EXP_LEN; + using UP::EXP_MASK; + using UP::EXP_MASK_SHIFT; + using UP::FP_MASK; + using UP::FRACTION_LEN; + using UP::FRACTION_MASK; + using UP::SIGN_MASK; // Reinterpreting bits as an integer value and interpreting the bits of an // integer value as a floating point value is used in tests. So, a convenient // type is provided for such reinterpretations. StorageType bits; + LIBC_INLINE constexpr FPBitsCommon() : bits(0) {} + LIBC_INLINE explicit constexpr FPBitsCommon(StorageType bits) : bits(bits) {} + LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) { mantVal &= FRACTION_MASK; bits &= ~FRACTION_MASK; @@ -66,16 +62,89 @@ template <typename T> struct FPBits : private FloatProperties<T> { return bits & FRACTION_MASK; } - LIBC_INLINE constexpr void set_biased_exponent(StorageType expVal) { - expVal = (expVal << FRACTION_LEN) & EXP_MASK; + LIBC_INLINE constexpr void set_sign(bool signVal) { + if (get_sign() != signVal) + bits ^= SIGN_MASK; + } + + LIBC_INLINE constexpr bool get_sign() const { + return (bits & SIGN_MASK) != 0; + } + + LIBC_INLINE constexpr void set_biased_exponent(StorageType biased) { + // clear exponent bits bits &= ~EXP_MASK; - bits |= expVal; + // set exponent bits + bits |= (biased << EXP_MASK_SHIFT) & EXP_MASK; } LIBC_INLINE constexpr uint16_t get_biased_exponent() const { - return uint16_t((bits & EXP_MASK) >> FRACTION_LEN); + return uint16_t((bits & EXP_MASK) >> EXP_MASK_SHIFT); + } + + LIBC_INLINE constexpr int get_exponent() const { + return int(get_biased_exponent()) - EXP_BIAS; } + // If the number is subnormal, the exponent is treated as if it were the + // minimum exponent for a normal number. This is to keep continuity between + // the normal and subnormal ranges, but it causes problems for functions where + // values are calculated from the exponent, since just subtracting the bias + // will give a slightly incorrect result. Additionally, zero has an exponent + // of zero, and that should actually be treated as zero. + LIBC_INLINE constexpr int get_explicit_exponent() const { + const int biased_exp = int(get_biased_exponent()); + if (is_zero()) { + return 0; + } else if (biased_exp == 0) { + return 1 - EXP_BIAS; + } else { + return biased_exp - EXP_BIAS; + } + } + + LIBC_INLINE constexpr StorageType uintval() const { return bits & FP_MASK; } + + LIBC_INLINE constexpr bool is_zero() const { + return (bits & EXP_SIG_MASK) == 0; + } +}; + +} // namespace internal + +// A generic class to represent single precision, double precision, and quad +// precision IEEE 754 floating point formats. +// On most platforms, the 'float' type corresponds to single precision floating +// point numbers, the 'double' type corresponds to double precision floating +// point numers, and the 'long double' type corresponds to the quad precision +// floating numbers. On x86 platforms however, the 'long double' type maps to +// an x87 floating point format. This format is an IEEE 754 extension format. +// It is handled as an explicit specialization of this class. 
+template <typename T> +struct FPBits : public internal::FPBitsCommon<get_fp_type<T>()> { + static_assert(cpp::is_floating_point_v<T>, + "FPBits instantiated with invalid type."); + using UP = internal::FPBitsCommon<get_fp_type<T>()>; + using StorageType = typename UP::StorageType; + using UP::bits; + +private: + using UP::EXP_SIG_MASK; + using UP::QUIET_NAN_MASK; + +public: + using UP::EXP_BIAS; + using UP::EXP_LEN; + using UP::EXP_MASK; + using UP::EXP_MASK_SHIFT; + using UP::FRACTION_LEN; + using UP::FRACTION_MASK; + using UP::SIGN_MASK; + using UP::TOTAL_LEN; + + using UP::get_biased_exponent; + using UP::is_zero; + // The function return mantissa with the implicit bit set iff the current // value is a valid normal number. LIBC_INLINE constexpr StorageType get_explicit_mantissa() { @@ -85,19 +154,6 @@ template <typename T> struct FPBits : private FloatProperties<T> { (FRACTION_MASK & bits); } - LIBC_INLINE constexpr void set_sign(bool signVal) { - bits |= SIGN_MASK; - if (!signVal) - bits -= SIGN_MASK; - } - - LIBC_INLINE constexpr bool get_sign() const { - return (bits & SIGN_MASK) != 0; - } - - static_assert(sizeof(T) == sizeof(StorageType), - "Data type and integral representation have different sizes."); - static constexpr int MAX_BIASED_EXPONENT = (1 << EXP_LEN) - 1; static constexpr StorageType MIN_SUBNORMAL = StorageType(1); static constexpr StorageType MAX_SUBNORMAL = FRACTION_MASK; @@ -109,49 +165,21 @@ template <typename T> struct FPBits : private FloatProperties<T> { // type match. template <typename XType, cpp::enable_if_t<cpp::is_same_v<T, XType>, int> = 0> LIBC_INLINE constexpr explicit FPBits(XType x) - : bits(cpp::bit_cast<StorageType>(x)) {} + : UP(cpp::bit_cast<StorageType>(x)) {} template <typename XType, cpp::enable_if_t<cpp::is_same_v<XType, StorageType>, int> = 0> - LIBC_INLINE constexpr explicit FPBits(XType x) : bits(x) {} - - LIBC_INLINE constexpr FPBits() : bits(0) {} + LIBC_INLINE constexpr explicit FPBits(XType x) : UP(x) {} - LIBC_INLINE constexpr T get_val() const { return cpp::bit_cast<T>(bits); } + LIBC_INLINE constexpr FPBits() : UP() {} LIBC_INLINE constexpr void set_val(T value) { bits = cpp::bit_cast<StorageType>(value); } - LIBC_INLINE constexpr explicit operator T() const { return get_val(); } - - LIBC_INLINE constexpr StorageType uintval() const { return bits; } - - LIBC_INLINE constexpr int get_exponent() const { - return int(get_biased_exponent()) - EXP_BIAS; - } - - // If the number is subnormal, the exponent is treated as if it were the - // minimum exponent for a normal number. This is to keep continuity between - // the normal and subnormal ranges, but it causes problems for functions where - // values are calculated from the exponent, since just subtracting the bias - // will give a slightly incorrect result. Additionally, zero has an exponent - // of zero, and that should actually be treated as zero. 
- LIBC_INLINE constexpr int get_explicit_exponent() const { - const int biased_exp = int(get_biased_exponent()); - if (is_zero()) { - return 0; - } else if (biased_exp == 0) { - return 1 - EXP_BIAS; - } else { - return biased_exp - EXP_BIAS; - } - } + LIBC_INLINE constexpr T get_val() const { return cpp::bit_cast<T>(bits); } - LIBC_INLINE constexpr bool is_zero() const { - // Remove sign bit by shift - return (bits << 1) == 0; - } + LIBC_INLINE constexpr explicit operator T() const { return get_val(); } LIBC_INLINE constexpr bool is_inf() const { return (bits & EXP_SIG_MASK) == EXP_MASK; diff --git a/libc/src/__support/FPUtil/FloatProperties.h b/libc/src/__support/FPUtil/FloatProperties.h index bcf1f7cfabd34..6bf75b7167d32 100644 --- a/libc/src/__support/FPUtil/FloatProperties.h +++ b/libc/src/__support/FPUtil/FloatProperties.h @@ -111,7 +111,7 @@ struct FPProperties : public internal::FPBaseProperties<fp_type> { (1U << (EXP_LEN - 1U)) - 1U; static_assert(EXP_BIAS > 0); -private: +protected: // The shift amount to get the *significand* part to the least significant // bit. Always `0` but kept for consistency. LIBC_INLINE_VAR static constexpr int SIG_MASK_SHIFT = 0; diff --git a/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h b/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h index b2b016adb661a..1011e61f03fd6 100644 --- a/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h +++ b/libc/src/__support/FPUtil/x86_64/LongDoubleBits.h @@ -26,20 +26,27 @@ namespace LIBC_NAMESPACE { namespace fputil { -template <> struct FPBits<long double> : private FloatProperties<long double> { - using typename FloatProperties<long double>::StorageType; - using FloatProperties<long double>::TOTAL_LEN; - using FloatProperties<long double>::EXP_MASK; - using FloatProperties<long double>::EXP_BIAS; - using FloatProperties<long double>::EXP_LEN; - using FloatProperties<long double>::FRACTION_MASK; - using FloatProperties<long double>::FRACTION_LEN; +template <> +struct FPBits<long double> + : public internal::FPBitsCommon<FPType::X86_Binary80> { + using UP = internal::FPBitsCommon<FPType::X86_Binary80>; + using StorageType = typename UP::StorageType; + using UP::bits; private: - using FloatProperties<long double>::QUIET_NAN_MASK; + using UP::EXP_SIG_MASK; + using UP::QUIET_NAN_MASK; public: - using FloatProperties<long double>::SIGN_MASK; + using UP::EXP_BIAS; + using UP::EXP_LEN; + using UP::EXP_MASK; + using UP::EXP_MASK_SHIFT; + using UP::FP_MASK; + using UP::FRACTION_LEN; + using UP::FRACTION_MASK; + using UP::SIGN_MASK; + using UP::TOTAL_LEN; static constexpr int MAX_BIASED_EXPONENT = 0x7FFF; static constexpr StorageType MIN_SUBNORMAL = StorageType(1); @@ -51,18 +58,6 @@ template <> struct FPBits<long double> : private FloatProperties<long double> { (StorageType(MAX_BIASED_EXPONENT - 1) << (FRACTION_LEN + 1)) | (StorageType(1) << FRACTION_LEN) | MAX_SUBNORMAL; - StorageType bits; - - LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) { - mantVal &= FRACTION_MASK; - bits &= ~FRACTION_MASK; - bits |= mantVal; - } - - LIBC_INLINE constexpr StorageType get_mantissa() const { - return bits & FRACTION_MASK; - } - LIBC_INLINE constexpr StorageType get_explicit_mantissa() const { // The x86 80 bit float represents the leading digit of the mantissa // explicitly. This is the mask for that bit. 
@@ -70,16 +65,6 @@ template <> struct FPBits<long double> : private FloatProperties<long double> { return bits & (FRACTION_MASK | EXPLICIT_BIT_MASK); } - LIBC_INLINE constexpr void set_biased_exponent(StorageType expVal) { - expVal = (expVal << (TOTAL_LEN - 1 - EXP_LEN)) & EXP_MASK; - bits &= ~EXP_MASK; - bits |= expVal; - } - - LIBC_INLINE constexpr uint16_t get_biased_exponent() const { - return uint16_t((bits & EXP_MASK) >> (TOTAL_LEN - 1 - EXP_LEN)); - } - LIBC_INLINE constexpr void set_implicit_bit(bool implicitVal) { bits &= ~(StorageType(1) << FRACTION_LEN); bits |= (StorageType(implicitVal) << FRACTION_LEN); @@ -89,22 +74,12 @@ template <> struct FPBits<long double> : private FloatProperties<long double> { return bool((bits & (StorageType(1) << FRACTION_LEN)) >> FRACTION_LEN); } - LIBC_INLINE constexpr void set_sign(bool signVal) { - bits &= ~SIGN_MASK; - StorageType sign1 = StorageType(signVal) << (TOTAL_LEN - 1); - bits |= sign1; - } - - LIBC_INLINE constexpr bool get_sign() const { - return bool((bits & SIGN_MASK) >> (TOTAL_LEN - 1)); - } - - LIBC_INLINE constexpr FPBits() : bits(0) {} + LIBC_INLINE constexpr FPBits() : UP() {} template <typename XType, cpp::enable_if_t<cpp::is_same_v<long double, XType>, int> = 0> LIBC_INLINE constexpr explicit FPBits(XType x) - : bits(cpp::bit_cast<StorageType>(x)) { + : UP(cpp::bit_cast<StorageType>(x)) { // bits starts uninitialized, and setting it to a long double only // overwrites the first 80 bits. This clears those upper bits. bits = bits & ((StorageType(1) << 80) - 1); @@ -112,47 +87,16 @@ template <> struct FPBits<long double> : private FloatProperties<long double> { template <typename XType, cpp::enable_if_t<cpp::is_same_v<XType, StorageType>, int> = 0> - LIBC_INLINE constexpr explicit FPBits(XType x) : bits(x) {} + LIBC_INLINE constexpr explicit FPBits(XType x) : UP(x) {} LIBC_INLINE constexpr operator long double() { return cpp::bit_cast<long double>(bits); } - LIBC_INLINE constexpr StorageType uintval() { - // We zero the padding bits as they can contain garbage. - return bits & FP_MASK; - } - LIBC_INLINE constexpr long double get_val() const { return cpp::bit_cast<long double>(bits); } - LIBC_INLINE constexpr int get_exponent() const { - return int(get_biased_exponent()) - EXP_BIAS; - } - - // If the number is subnormal, the exponent is treated as if it were the - // minimum exponent for a normal number. This is to keep continuity between - // the normal and subnormal ranges, but it causes problems for functions where - // values are calculated from the exponent, since just subtracting the bias - // will give a slightly incorrect result. Additionally, zero has an exponent - // of zero, and that should actually be treated as zero. 
- LIBC_INLINE constexpr int get_explicit_exponent() const { - const int biased_exp = int(get_biased_exponent()); - if (is_zero()) { - return 0; - } else if (biased_exp == 0) { - return 1 - EXP_BIAS; - } else { - return biased_exp - EXP_BIAS; - } - } - - LIBC_INLINE constexpr bool is_zero() const { - return get_biased_exponent() == 0 && get_mantissa() == 0 && - get_implicit_bit() == 0; - } - LIBC_INLINE constexpr bool is_inf() const { return get_biased_exponent() == MAX_BIASED_EXPONENT && get_mantissa() == 0 && get_implicit_bit() == 1; From 7c8787511b8ba6c29aa8f6551f3406ecbe69243d Mon Sep 17 00:00:00 2001 From: Christian Sigg <chsigg@users.noreply.github.com> Date: Thu, 21 Dec 2023 13:24:11 +0100 Subject: [PATCH 054/342] [mlir][bazel] Fix build after acaff70841f59a1aec2a3c417e9f3a0f14eb47ad --- utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel index 049098b158f29..5882a311c9b6f 100644 --- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel @@ -1451,7 +1451,7 @@ filegroup( filegroup( name = "TransformExtrasPackagePyFiles", - srcs = glob(["mlir/extras/dialects/transform/*.py"]), + srcs = glob(["mlir/dialects/transform/extras/*.py"]), ) ##---------------------------------------------------------------------------## From 23adef4b85e341d4f0d1ff2d2185e4b3fa499a05 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko <gribozavr@gmail.com> Date: Thu, 21 Dec 2023 13:08:31 +0100 Subject: [PATCH 055/342] [llvm-readtapi][test] Write test outputs into a temporary directory --- llvm/test/tools/llvm-readtapi/stubify-invalid.test | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/test/tools/llvm-readtapi/stubify-invalid.test b/llvm/test/tools/llvm-readtapi/stubify-invalid.test index 3ecc9ff1aeee3..ae97d47c51991 100644 --- a/llvm/test/tools/llvm-readtapi/stubify-invalid.test +++ b/llvm/test/tools/llvm-readtapi/stubify-invalid.test @@ -1,6 +1,7 @@ ; RUN: rm -rf %t -; RUN: not llvm-readtapi -stubify %t/objc.dylib %t/flat_namespace.dylib %t/thread_local.dylib %t/fat.dylib --o tmp.tbd 2>&1 | FileCheck %s --allow-empty --check-prefix OUT -; RUN: not llvm-readtapi -stubify --o tmp.tbd 2>&1 | FileCheck %s --allow-empty --check-prefix IN +; RUN: mkdir -p %t +; RUN: not llvm-readtapi -stubify %t/objc.dylib %t/flat_namespace.dylib %t/thread_local.dylib %t/fat.dylib --o %t/tmp.tbd 2>&1 | FileCheck %s --allow-empty --check-prefix OUT +; RUN: not llvm-readtapi -stubify --o %t/tmp.tbd 2>&1 | FileCheck %s --allow-empty --check-prefix IN ; OUT: error: cannot write multiple inputs into single output file ; IN: error: stubify requires at least one input file From 2e3d77d6edae0c790bacbc5841f664bb08bab159 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis <paschalis.mpeis@arm.com> Date: Thu, 21 Dec 2023 12:37:57 +0000 Subject: [PATCH 056/342] [TLI] Pass replace-with-veclib works with Scalable Vectors. (#73642) [TLI] Pass replace-with-veclib works with Scalable Vectors. The pass is heavily refactored. It uses the Masked variant of a TLI method when the Intrinsic operates on Scalable Vectors. Improve tests for ArmPL and SLEEF Intrinsics: - Auto-generate test `armpl-intrinsics.ll`, and use active lane mask to have shorter `shufflevector` check lines. 
- Update scripts now add `@llvm.compiler.used` instead of using the regex: `@[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]]` - Add simplifycfg pass and noalias to ensure tail folding. `noalias` attribute was added only to the `%in.ptr` parameter of the ArmPL Intrinsics. --- llvm/lib/Analysis/VFABIDemangling.cpp | 2 +- llvm/lib/CodeGen/ReplaceWithVeclib.cpp | 211 +++++++++--------- .../replace-intrinsics-with-veclib-armpl.ll | 36 +-- ...e-intrinsics-with-veclib-sleef-scalable.ll | 37 +-- 4 files changed, 144 insertions(+), 142 deletions(-) diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp index 22fc52070015c..426f98c0c6284 100644 --- a/llvm/lib/Analysis/VFABIDemangling.cpp +++ b/llvm/lib/Analysis/VFABIDemangling.cpp @@ -126,7 +126,7 @@ static ParseRet tryParseLinearTokenWithRuntimeStep(StringRef &ParseString, return ParseRet::None; } -/// The function looks for the following stringt at the beginning of +/// The function looks for the following string at the beginning of /// the input string `ParseString`: /// /// <token> <number> diff --git a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp index 36c91b7fa97e4..893aa4a91828d 100644 --- a/llvm/lib/CodeGen/ReplaceWithVeclib.cpp +++ b/llvm/lib/CodeGen/ReplaceWithVeclib.cpp @@ -15,14 +15,17 @@ #include "llvm/CodeGen/ReplaceWithVeclib.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" +#include "llvm/Support/TypeSize.h" #include "llvm/Transforms/Utils/ModuleUtils.h" using namespace llvm; @@ -38,138 +41,137 @@ STATISTIC(NumTLIFuncDeclAdded, STATISTIC(NumFuncUsedAdded, "Number of functions added to `llvm.compiler.used`"); -static bool replaceWithTLIFunction(CallInst &CI, const StringRef TLIName) { - Module *M = CI.getModule(); - - Function *OldFunc = CI.getCalledFunction(); - - // Check if the vector library function is already declared in this module, - // otherwise insert it. +/// Returns a vector Function that it adds to the Module \p M. When an \p +/// ScalarFunc is not null, it copies its attributes to the newly created +/// Function. +Function *getTLIFunction(Module *M, FunctionType *VectorFTy, + const StringRef TLIName, + Function *ScalarFunc = nullptr) { Function *TLIFunc = M->getFunction(TLIName); if (!TLIFunc) { - TLIFunc = Function::Create(OldFunc->getFunctionType(), - Function::ExternalLinkage, TLIName, *M); - TLIFunc->copyAttributesFrom(OldFunc); + TLIFunc = + Function::Create(VectorFTy, Function::ExternalLinkage, TLIName, *M); + if (ScalarFunc) + TLIFunc->copyAttributesFrom(ScalarFunc); LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added vector library function `" << TLIName << "` of type `" << *(TLIFunc->getType()) << "` to module.\n"); ++NumTLIFuncDeclAdded; - - // Add the freshly created function to llvm.compiler.used, - // similar to as it is done in InjectTLIMappings + // Add the freshly created function to llvm.compiler.used, similar to as it + // is done in InjectTLIMappings. 
appendToCompilerUsed(*M, {TLIFunc}); - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << TLIName << "` to `@llvm.compiler.used`.\n"); ++NumFuncUsedAdded; } + return TLIFunc; +} - // Replace the call to the vector intrinsic with a call - // to the corresponding function from the vector library. - IRBuilder<> IRBuilder(&CI); - SmallVector<Value *> Args(CI.args()); - // Preserve the operand bundles. - SmallVector<OperandBundleDef, 1> OpBundles; - CI.getOperandBundlesAsDefs(OpBundles); - CallInst *Replacement = IRBuilder.CreateCall(TLIFunc, Args, OpBundles); - assert(OldFunc->getFunctionType() == TLIFunc->getFunctionType() && - "Expecting function types to be identical"); - CI.replaceAllUsesWith(Replacement); - if (isa<FPMathOperator>(Replacement)) { - // Preserve fast math flags for FP math. - Replacement->copyFastMathFlags(&CI); +/// Replace the call to the vector intrinsic ( \p CalltoReplace ) with a call to +/// the corresponding function from the vector library ( \p TLIVecFunc ). +static void replaceWithTLIFunction(CallInst &CalltoReplace, VFInfo &Info, + Function *TLIVecFunc) { + IRBuilder<> IRBuilder(&CalltoReplace); + SmallVector<Value *> Args(CalltoReplace.args()); + if (auto OptMaskpos = Info.getParamIndexForOptionalMask()) { + auto *MaskTy = VectorType::get(Type::getInt1Ty(CalltoReplace.getContext()), + Info.Shape.VF); + Args.insert(Args.begin() + OptMaskpos.value(), + Constant::getAllOnesValue(MaskTy)); } - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" - << OldFunc->getName() << "` with call to `" << TLIName - << "`.\n"); - ++NumCallsReplaced; - return true; + // Preserve the operand bundles. + SmallVector<OperandBundleDef, 1> OpBundles; + CalltoReplace.getOperandBundlesAsDefs(OpBundles); + CallInst *Replacement = IRBuilder.CreateCall(TLIVecFunc, Args, OpBundles); + CalltoReplace.replaceAllUsesWith(Replacement); + // Preserve fast math flags for FP math. + if (isa<FPMathOperator>(Replacement)) + Replacement->copyFastMathFlags(&CalltoReplace); } +/// Returns true when successfully replaced \p CallToReplace with a suitable +/// function taking vector arguments, based on available mappings in the \p TLI. +/// Currently only works when \p CallToReplace is a call to vectorized +/// intrinsic. static bool replaceWithCallToVeclib(const TargetLibraryInfo &TLI, - CallInst &CI) { - if (!CI.getCalledFunction()) { + CallInst &CallToReplace) { + if (!CallToReplace.getCalledFunction()) return false; - } - auto IntrinsicID = CI.getCalledFunction()->getIntrinsicID(); - if (IntrinsicID == Intrinsic::not_intrinsic) { - // Replacement is only performed for intrinsic functions + auto IntrinsicID = CallToReplace.getCalledFunction()->getIntrinsicID(); + // Replacement is only performed for intrinsic functions. + if (IntrinsicID == Intrinsic::not_intrinsic) return false; - } - // Convert vector arguments to scalar type and check that - // all vector operands have identical vector width. + // Compute arguments types of the corresponding scalar call. Additionally + // checks if in the vector call, all vector operands have the same EC. ElementCount VF = ElementCount::getFixed(0); - SmallVector<Type *> ScalarTypes; - for (auto Arg : enumerate(CI.args())) { - auto *ArgType = Arg.value()->getType(); - // Vector calls to intrinsics can still have - // scalar operands for specific arguments. 
+ SmallVector<Type *> ScalarArgTypes; + for (auto Arg : enumerate(CallToReplace.args())) { + auto *ArgTy = Arg.value()->getType(); if (isVectorIntrinsicWithScalarOpAtArg(IntrinsicID, Arg.index())) { - ScalarTypes.push_back(ArgType); - } else { - // The argument in this place should be a vector if - // this is a call to a vector intrinsic. - auto *VectorArgTy = dyn_cast<VectorType>(ArgType); - if (!VectorArgTy) { - // The argument is not a vector, do not perform - // the replacement. - return false; - } - ElementCount NumElements = VectorArgTy->getElementCount(); - if (NumElements.isScalable()) { - // The current implementation does not support - // scalable vectors. + ScalarArgTypes.push_back(ArgTy); + } else if (auto *VectorArgTy = dyn_cast<VectorType>(ArgTy)) { + ScalarArgTypes.push_back(ArgTy->getScalarType()); + // Disallow vector arguments with different VFs. When processing the first + // vector argument, store it's VF, and for the rest ensure that they match + // it. + if (VF.isZero()) + VF = VectorArgTy->getElementCount(); + else if (VF != VectorArgTy->getElementCount()) return false; - } - if (VF.isNonZero() && VF != NumElements) { - // The different arguments differ in vector size. - return false; - } else { - VF = NumElements; - } - ScalarTypes.push_back(VectorArgTy->getElementType()); - } + } else + // Exit when it is supposed to be a vector argument but it isn't. + return false; } - // Try to reconstruct the name for the scalar version of this - // intrinsic using the intrinsic ID and the argument types - // converted to scalar above. - std::string ScalarName; - if (Intrinsic::isOverloaded(IntrinsicID)) { - ScalarName = Intrinsic::getName(IntrinsicID, ScalarTypes, CI.getModule()); - } else { - ScalarName = Intrinsic::getName(IntrinsicID).str(); - } + // Try to reconstruct the name for the scalar version of this intrinsic using + // the intrinsic ID and the argument types converted to scalar above. + std::string ScalarName = + (Intrinsic::isOverloaded(IntrinsicID) + ? Intrinsic::getName(IntrinsicID, ScalarArgTypes, + CallToReplace.getModule()) + : Intrinsic::getName(IntrinsicID).str()); + + // Try to find the mapping for the scalar version of this intrinsic and the + // exact vector width of the call operands in the TargetLibraryInfo. First, + // check with a non-masked variant, and if that fails try with a masked one. + const VecDesc *VD = + TLI.getVectorMappingInfo(ScalarName, VF, /*Masked*/ false); + if (!VD && !(VD = TLI.getVectorMappingInfo(ScalarName, VF, /*Masked*/ true))) + return false; - if (!TLI.isFunctionVectorizable(ScalarName)) { - // The TargetLibraryInfo does not contain a vectorized version of - // the scalar function. + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI mapping from: `" << ScalarName + << "` and vector width " << VF << " to: `" + << VD->getVectorFnName() << "`.\n"); + + // Replace the call to the intrinsic with a call to the vector library + // function. + Type *ScalarRetTy = CallToReplace.getType()->getScalarType(); + FunctionType *ScalarFTy = + FunctionType::get(ScalarRetTy, ScalarArgTypes, /*isVarArg*/ false); + const std::string MangledName = VD->getVectorFunctionABIVariantString(); + auto OptInfo = VFABI::tryDemangleForVFABI(MangledName, ScalarFTy); + if (!OptInfo) return false; - } - // Try to find the mapping for the scalar version of this intrinsic - // and the exact vector width of the call operands in the - // TargetLibraryInfo. 
- StringRef TLIName = TLI.getVectorizedFunction(ScalarName, VF); - - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Looking up TLI mapping for `" - << ScalarName << "` and vector width " << VF << ".\n"); - - if (!TLIName.empty()) { - // Found the correct mapping in the TargetLibraryInfo, - // replace the call to the intrinsic with a call to - // the vector library function. - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Found TLI function `" << TLIName - << "`.\n"); - return replaceWithTLIFunction(CI, TLIName); - } + FunctionType *VectorFTy = VFABI::createFunctionType(*OptInfo, ScalarFTy); + if (!VectorFTy) + return false; + + Function *FuncToReplace = CallToReplace.getCalledFunction(); + Function *TLIFunc = getTLIFunction(CallToReplace.getModule(), VectorFTy, + VD->getVectorFnName(), FuncToReplace); + replaceWithTLIFunction(CallToReplace, *OptInfo, TLIFunc); - return false; + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `" + << FuncToReplace->getName() << "` with call to `" + << TLIFunc->getName() << "`.\n"); + ++NumCallsReplaced; + return true; } static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { @@ -185,9 +187,8 @@ static bool runImpl(const TargetLibraryInfo &TLI, Function &F) { } // Erase the calls to the intrinsics that have been replaced // with calls to the vector library. - for (auto *CI : ReplacedCalls) { + for (auto *CI : ReplacedCalls) CI->eraseFromParent(); - } return Changed; } @@ -207,10 +208,10 @@ PreservedAnalyses ReplaceWithVeclib::run(Function &F, PA.preserve<DemandedBitsAnalysis>(); PA.preserve<OptimizationRemarkEmitterAnalysis>(); return PA; - } else { - // The pass did not replace any calls, hence it preserves all analyses. - return PreservedAnalyses::all(); } + + // The pass did not replace any calls, hence it preserves all analyses. + return PreservedAnalyses::all(); } //////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll index 18431ae021f97..d41870ec6e791 100644 --- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-armpl.ll @@ -15,7 +15,7 @@ declare <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double>) declare <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float>) ;. 
-; CHECK: @llvm.compiler.used = appending global [16 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32], section "llvm.metadata" +; CHECK: @llvm.compiler.used = appending global [32 x ptr] [ptr @armpl_vcosq_f64, ptr @armpl_vcosq_f32, ptr @armpl_svcos_f64_x, ptr @armpl_svcos_f32_x, ptr @armpl_vsinq_f64, ptr @armpl_vsinq_f32, ptr @armpl_svsin_f64_x, ptr @armpl_svsin_f32_x, ptr @armpl_vexpq_f64, ptr @armpl_vexpq_f32, ptr @armpl_svexp_f64_x, ptr @armpl_svexp_f32_x, ptr @armpl_vexp2q_f64, ptr @armpl_vexp2q_f32, ptr @armpl_svexp2_f64_x, ptr @armpl_svexp2_f32_x, ptr @armpl_vexp10q_f64, ptr @armpl_vexp10q_f32, ptr @armpl_svexp10_f64_x, ptr @armpl_svexp10_f32_x, ptr @armpl_vlogq_f64, ptr @armpl_vlogq_f32, ptr @armpl_svlog_f64_x, ptr @armpl_svlog_f32_x, ptr @armpl_vlog2q_f64, ptr @armpl_vlog2q_f32, ptr @armpl_svlog2_f64_x, ptr @armpl_svlog2_f32_x, ptr @armpl_vlog10q_f64, ptr @armpl_vlog10q_f32, ptr @armpl_svlog10_f64_x, ptr @armpl_svlog10_f32_x], section "llvm.metadata" ;. define <2 x double> @llvm_cos_f64(<2 x double> %in) { ; CHECK-LABEL: define <2 x double> @llvm_cos_f64 @@ -40,7 +40,7 @@ define <4 x float> @llvm_cos_f32(<4 x float> %in) { define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) #0 { ; CHECK-LABEL: define <vscale x 2 x double> @llvm_cos_vscale_f64 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in) @@ -50,7 +50,7 @@ define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) #0 define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) #0 { ; CHECK-LABEL: define <vscale x 4 x float> @llvm_cos_vscale_f32 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %in) @@ -85,7 +85,7 @@ define <4 x float> @llvm_sin_f32(<4 x float> %in) { define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) #0 { ; CHECK-LABEL: define <vscale x 2 x double> @llvm_sin_vscale_f64 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> 
shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %in) @@ -95,7 +95,7 @@ define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) #0 define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) #0 { ; CHECK-LABEL: define <vscale x 4 x float> @llvm_sin_vscale_f32 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %in) @@ -130,7 +130,7 @@ define <4 x float> @llvm_exp_f32(<4 x float> %in) { define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) #0 { ; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp_vscale_f64 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> %in) @@ -140,7 +140,7 @@ define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) #0 define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) #0 { ; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp_vscale_f32 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %in) @@ -175,7 +175,7 @@ define <4 x float> @llvm_exp2_f32(<4 x float> %in) { define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) #0 { ; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp2_vscale_f64 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in) 
@@ -185,7 +185,7 @@ define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) #0 define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) #0 { ; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp2_vscale_f32 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in) @@ -220,7 +220,7 @@ define <4 x float> @llvm_exp10_f32(<4 x float> %in) { define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) #0 { ; CHECK-LABEL: define <vscale x 2 x double> @llvm_exp10_vscale_f64 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> %in) @@ -230,7 +230,7 @@ define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) # define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) #0 { ; CHECK-LABEL: define <vscale x 4 x float> @llvm_exp10_vscale_f32 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in) @@ -265,7 +265,7 @@ define <4 x float> @llvm_log_f32(<4 x float> %in) { define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) #0 { ; CHECK-LABEL: define <vscale x 2 x double> @llvm_log_vscale_f64 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> %in) @@ -275,7 +275,7 @@ define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) #0 define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) #0 { ; CHECK-LABEL: define <vscale x 4 x float> @llvm_log_vscale_f32 ; CHECK-SAME: 
(<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %in) @@ -310,7 +310,7 @@ define <4 x float> @llvm_log2_f32(<4 x float> %in) { define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) #0 { ; CHECK-LABEL: define <vscale x 2 x double> @llvm_log2_vscale_f64 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> %in) @@ -320,7 +320,7 @@ define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) #0 define <vscale x 4 x float> @llvm_log2_vscale_f32(<vscale x 4 x float> %in) #0 { ; CHECK-LABEL: define <vscale x 4 x float> @llvm_log2_vscale_f32 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> %in) @@ -355,7 +355,7 @@ define <4 x float> @llvm_log10_f32(<4 x float> %in) { define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) #0 { ; CHECK-LABEL: define <vscale x 2 x double> @llvm_log10_vscale_f64 ; CHECK-SAME: (<vscale x 2 x double> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[IN]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> %in) @@ -365,7 +365,7 @@ define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) # define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) #0 { ; CHECK-LABEL: define <vscale x 4 x float> @llvm_log10_vscale_f32 ; CHECK-SAME: (<vscale x 4 x float> [[IN:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> [[IN]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> 
[[IN]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> %in) @@ -380,7 +380,7 @@ declare <vscale x 4 x float> @llvm.pow.nxv4f32(<vscale x 4 x float>, <vscale x 4 ; ; There is a bug in the replace-with-veclib pass, and for intrinsics which take ; more than one arguments, but has just one overloaded type, it incorrectly -; reconstructs the scalar name, for pow specificlly it is searching for: +; reconstructs the scalar name, for pow specifically it is searching for: ; llvm.pow.f64.f64 and llvm.pow.f32.f32 ; diff --git a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll index 8b06c41bcb1a6..c2ff6014bc694 100644 --- a/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll +++ b/llvm/test/CodeGen/AArch64/replace-intrinsics-with-veclib-sleef-scalable.ll @@ -3,8 +3,9 @@ target triple = "aarch64-unknown-linux-gnu" -; NOTE: The existing TLI mappings are not used since the -replace-with-veclib pass is broken for scalable vectors. - +;. +; CHECK: @llvm.compiler.used = appending global [16 x ptr] [ptr @_ZGVsMxv_cos, ptr @_ZGVsMxv_cosf, ptr @_ZGVsMxv_exp, ptr @_ZGVsMxv_expf, ptr @_ZGVsMxv_exp2, ptr @_ZGVsMxv_exp2f, ptr @_ZGVsMxv_exp10, ptr @_ZGVsMxv_exp10f, ptr @_ZGVsMxv_log, ptr @_ZGVsMxv_logf, ptr @_ZGVsMxv_log10, ptr @_ZGVsMxv_log10f, ptr @_ZGVsMxv_log2, ptr @_ZGVsMxv_log2f, ptr @_ZGVsMxv_sin, ptr @_ZGVsMxv_sinf], section "llvm.metadata" +;. define <vscale x 2 x double> @llvm_ceil_vscale_f64(<vscale x 2 x double> %in) { ; CHECK-LABEL: @llvm_ceil_vscale_f64( ; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[IN:%.*]]) @@ -43,7 +44,7 @@ define <vscale x 4 x float> @llvm_copysign_vscale_f32(<vscale x 4 x float> %mag, define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) { ; CHECK-LABEL: @llvm_cos_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.cos.nxv2f64(<vscale x 2 x double> %in) @@ -52,7 +53,7 @@ define <vscale x 2 x double> @llvm_cos_vscale_f64(<vscale x 2 x double> %in) { define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) { ; CHECK-LABEL: @llvm_cos_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.cos.nxv4f32(<vscale x 4 x float> %in) @@ -61,7 +62,7 @@ define <vscale x 4 x float> @llvm_cos_vscale_f32(<vscale x 4 x float> %in) { define <vscale x 2 x 
double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) { ; CHECK-LABEL: @llvm_exp_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.exp.nxv2f64(<vscale x 2 x double> %in) @@ -70,7 +71,7 @@ define <vscale x 2 x double> @llvm_exp_vscale_f64(<vscale x 2 x double> %in) { define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) { ; CHECK-LABEL: @llvm_exp_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float> %in) @@ -79,7 +80,7 @@ define <vscale x 4 x float> @llvm_exp_vscale_f32(<vscale x 4 x float> %in) { define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) { ; CHECK-LABEL: @llvm_exp2_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.exp2.nxv2f64(<vscale x 2 x double> %in) @@ -88,7 +89,7 @@ define <vscale x 2 x double> @llvm_exp2_vscale_f64(<vscale x 2 x double> %in) { define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) { ; CHECK-LABEL: @llvm_exp2_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.exp2.nxv4f32(<vscale x 4 x float> %in) @@ -97,7 +98,7 @@ define <vscale x 4 x float> @llvm_exp2_vscale_f32(<vscale x 4 x float> %in) { define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) { ; CHECK-LABEL: @llvm_exp10_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.exp10.nxv2f64(<vscale x 2 x double> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> 
@llvm.exp10.nxv2f64(<vscale x 2 x double> %in) @@ -106,7 +107,7 @@ define <vscale x 2 x double> @llvm_exp10_vscale_f64(<vscale x 2 x double> %in) { define <vscale x 4 x float> @llvm_exp10_vscale_f32(<vscale x 4 x float> %in) { ; CHECK-LABEL: @llvm_exp10_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.exp10.nxv4f32(<vscale x 4 x float> %in) @@ -169,7 +170,7 @@ define <vscale x 4 x float> @llvm_fma_vscale_f32(<vscale x 4 x float> %a, <vscal define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) { ; CHECK-LABEL: @llvm_log_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> %in) @@ -178,7 +179,7 @@ define <vscale x 2 x double> @llvm_log_vscale_f64(<vscale x 2 x double> %in) { define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) { ; CHECK-LABEL: @llvm_log_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %in) @@ -187,7 +188,7 @@ define <vscale x 4 x float> @llvm_log_vscale_f32(<vscale x 4 x float> %in) { define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) { ; CHECK-LABEL: @llvm_log10_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> %in) @@ -196,7 +197,7 @@ define <vscale x 2 x double> @llvm_log10_vscale_f64(<vscale x 2 x double> %in) { define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) { ; CHECK-LABEL: @llvm_log10_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x 
i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> %in) @@ -205,7 +206,7 @@ define <vscale x 4 x float> @llvm_log10_vscale_f32(<vscale x 4 x float> %in) { define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) { ; CHECK-LABEL: @llvm_log2_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> %in) @@ -214,7 +215,7 @@ define <vscale x 2 x double> @llvm_log2_vscale_f64(<vscale x 2 x double> %in) { define <vscale x 4 x float> @llvm_log2_vscale_f32(<vscale x 4 x float> %in) { ; CHECK-LABEL: @llvm_log2_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> %in) @@ -331,7 +332,7 @@ define <vscale x 4 x float> @llvm_round_vscale_f32(<vscale x 4 x float> %in) { define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) { ; CHECK-LABEL: @llvm_sin_vscale_f64( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[IN:%.*]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 2 x double> [[TMP1]] ; %1 = call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %in) @@ -340,7 +341,7 @@ define <vscale x 2 x double> @llvm_sin_vscale_f64(<vscale x 2 x double> %in) { define <vscale x 4 x float> @llvm_sin_vscale_f32(<vscale x 4 x float> %in) { ; CHECK-LABEL: @llvm_sin_vscale_f32( -; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> [[IN:%.*]]) +; CHECK-NEXT: [[TMP1:%.*]] = call fast <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[IN:%.*]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)) ; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]] ; %1 = call fast <vscale x 4 x float> @llvm.sin.nxv4f32(<vscale x 4 x float> %in) From cd09f4b9510583e847267cae04eee87caf4d5e9d Mon Sep 17 00:00:00 2001 From: yan zhou <42528857+zhou3968322@users.noreply.github.com> Date: Thu, 21 Dec 2023 20:39:05 +0800 Subject: [PATCH 057/342] [CodeGen] This patch fixes a bug that may cause an error for a self-defined target in SelectionDAG::getNode (#75320) We need to first check that N1.getNumOperands() > 0.
If lowering generates an SDNode like: ``` v2i32 t20: TargetOpNode. i32 t21: extract_vector_elt t20 0 i32 t22: extract_vector_elt t20 1 ``` it will cause an error. --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 5be1892a44f6d..81facf92e55ae 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6858,8 +6858,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // expanding copies of large vectors from registers. This only works for // fixed length vectors, since we need to know the exact number of // elements. - if (N2C && N1.getOperand(0).getValueType().isFixedLengthVector() && - N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0) { + if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS && + N1.getOperand(0).getValueType().isFixedLengthVector()) { unsigned Factor = N1.getOperand(0).getValueType().getVectorNumElements(); return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, @@ -6976,7 +6976,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of // the concat have the same type as the extract. - if (N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0 && + if (N1.getOpcode() == ISD::CONCAT_VECTORS && VT == N1.getOperand(0).getValueType()) { unsigned Factor = VT.getVectorMinNumElements(); return N1.getOperand(N2C->getZExtValue() / Factor); From 95e5839e06fdffd278499257c6e7679bba3d6868 Mon Sep 17 00:00:00 2001 From: oltolm <oleg.tolmatcev@gmail.com> Date: Thu, 21 Dec 2023 13:42:22 +0100 Subject: [PATCH 058/342] [lldb] add support for thread names on Windows (#74731) This PR adds support for thread names in lldb on Windows.
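As a rough, illustrative sketch of the Win32 API involved (not part of the patch, and assuming Windows 10 1607+ where `SetThreadDescription`/`GetThreadDescription` are declared by the SDK; the plugin itself resolves `GetThreadDescription` at runtime via `GetProcAddress`), naming a thread and reading the name back looks roughly like this:
```
// Hedged sketch only: direct use of the thread-description APIs that the
// plugin loads dynamically from Kernel32.dll.
#include <windows.h>
#include <cwchar>

void demo_thread_name(HANDLE thread) {
  // Attach a UTF-16 name to the thread; debuggers can read it back later.
  ::SetThreadDescription(thread, L"SPUW.6");

  PWSTR name = nullptr;
  if (SUCCEEDED(::GetThreadDescription(thread, &name))) {
    std::wprintf(L"name = %ls\n", name); // prints "SPUW.6"
    ::LocalFree(name); // the returned buffer must be freed with LocalFree
  }
}
```
The session below shows how such names then appear in `thread list`: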
``` (lldb) thr list Process 2960 stopped thread #53: tid = 0x03a0, 0x00007ff84582db34 ntdll.dll`NtWaitForMultipleObjects + 20 thread #29: tid = 0x04ec, 0x00007ff845830a14 ntdll.dll`NtWaitForAlertByThreadId + 20, name = 'SPUW.6' thread #89: tid = 0x057c, 0x00007ff845830a14 ntdll.dll`NtWaitForAlertByThreadId + 20, name = 'PPU[0x1000019] physics[main]' thread #3: tid = 0x0648, 0x00007ff843c2cafe combase.dll`InternalDoATClassCreate + 39518 thread #93: tid = 0x0688, 0x00007ff845830a14 ntdll.dll`NtWaitForAlertByThreadId + 20, name = 'PPU[0x100501d] uMovie::StreamingThread' thread #1: tid = 0x087c, 0x00007ff842e7a104 win32u.dll`NtUserMsgWaitForMultipleObjectsEx + 20 thread #96: tid = 0x0890, 0x00007ff845830a14 ntdll.dll`NtWaitForAlertByThreadId + 20, name = 'PPU[0x1002020] HLE Video Decoder' <...> ``` --- .../Windows/Common/TargetThreadWindows.cpp | 37 ++++++++-- .../Windows/Common/TargetThreadWindows.h | 2 + lldb/unittests/Thread/CMakeLists.txt | 2 + lldb/unittests/Thread/ThreadTest.cpp | 71 +++++++++++++++++++ 4 files changed, 106 insertions(+), 6 deletions(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp index 37dc8f6d6d14a..ad67e764fe10f 100644 --- a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.cpp @@ -7,18 +7,14 @@ //===----------------------------------------------------------------------===// #include "lldb/Host/HostInfo.h" -#include "lldb/Host/HostNativeThreadBase.h" -#include "lldb/Host/windows/HostThreadWindows.h" -#include "lldb/Host/windows/windows.h" -#include "lldb/Target/RegisterContext.h" #include "lldb/Target/Unwind.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" -#include "lldb/Utility/State.h" #include "ProcessWindows.h" -#include "ProcessWindowsLog.h" #include "TargetThreadWindows.h" +#include "lldb/Host/windows/HostThreadWindows.h" +#include <llvm/Support/ConvertUTF.h> #if defined(__x86_64__) || defined(_M_AMD64) #include "x64/RegisterContextWindows_x64.h" @@ -33,6 +29,9 @@ using namespace lldb; using namespace lldb_private; +using GetThreadDescriptionFunctionPtr = HRESULT +WINAPI (*)(HANDLE hThread, PWSTR *ppszThreadDescription); + TargetThreadWindows::TargetThreadWindows(ProcessWindows &process, const HostThread &thread) : Thread(process, thread.GetNativeThread().GetThreadId()), @@ -175,3 +174,29 @@ Status TargetThreadWindows::DoResume() { return Status(); } + +const char *TargetThreadWindows::GetName() { + Log *log = GetLog(LLDBLog::Thread); + static GetThreadDescriptionFunctionPtr GetThreadDescription = []() { + HMODULE hModule = ::LoadLibraryW(L"Kernel32.dll"); + return hModule ? 
reinterpret_cast<GetThreadDescriptionFunctionPtr>( + ::GetProcAddress(hModule, "GetThreadDescription")) + : nullptr; + }(); + LLDB_LOGF(log, "GetProcAddress: %p", + reinterpret_cast<void *>(GetThreadDescription)); + if (!GetThreadDescription) + return m_name.c_str(); + PWSTR pszThreadName; + if (SUCCEEDED(GetThreadDescription( + m_host_thread.GetNativeThread().GetSystemHandle(), &pszThreadName))) { + LLDB_LOGF(log, "GetThreadDescription: %ls", pszThreadName); + llvm::convertUTF16ToUTF8String( + llvm::ArrayRef(reinterpret_cast<char *>(pszThreadName), + wcslen(pszThreadName) * sizeof(wchar_t)), + m_name); + ::LocalFree(pszThreadName); + } + + return m_name.c_str(); +} diff --git a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.h b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.h index 2845847738f60..07e1db464ad59 100644 --- a/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.h +++ b/lldb/source/Plugins/Process/Windows/Common/TargetThreadWindows.h @@ -34,6 +34,7 @@ class TargetThreadWindows : public lldb_private::Thread { lldb::RegisterContextSP CreateRegisterContextForFrame(StackFrame *frame) override; bool CalculateStopInfo() override; + const char *GetName() override; Status DoResume(); @@ -42,6 +43,7 @@ class TargetThreadWindows : public lldb_private::Thread { private: lldb::RegisterContextSP m_thread_reg_ctx_sp; HostThread m_host_thread; + std::string m_name; }; } // namespace lldb_private diff --git a/lldb/unittests/Thread/CMakeLists.txt b/lldb/unittests/Thread/CMakeLists.txt index d6e365adac5dd..f6c8795f349a5 100644 --- a/lldb/unittests/Thread/CMakeLists.txt +++ b/lldb/unittests/Thread/CMakeLists.txt @@ -11,5 +11,7 @@ add_lldb_unittest(ThreadTests lldbInterpreter lldbBreakpoint lldbPluginPlatformLinux + lldbPluginPlatformWindows + lldbPluginProcessWindowsCommon ) diff --git a/lldb/unittests/Thread/ThreadTest.cpp b/lldb/unittests/Thread/ThreadTest.cpp index bd8cdce99f172..4c660e9815c3e 100644 --- a/lldb/unittests/Thread/ThreadTest.cpp +++ b/lldb/unittests/Thread/ThreadTest.cpp @@ -8,9 +8,20 @@ #include "lldb/Target/Thread.h" #include "Plugins/Platform/Linux/PlatformLinux.h" +#include <thread> +#ifdef _WIN32 +#include "lldb/Host/windows/HostThreadWindows.h" +#include "lldb/Host/windows/windows.h" + +#include "Plugins/Platform/Windows/PlatformWindows.h" +#include "Plugins/Process/Windows/Common/LocalDebugDelegate.h" +#include "Plugins/Process/Windows/Common/ProcessWindows.h" +#include "Plugins/Process/Windows/Common/TargetThreadWindows.h" +#endif #include "lldb/Core/Debugger.h" #include "lldb/Host/FileSystem.h" #include "lldb/Host/HostInfo.h" +#include "lldb/Host/HostThread.h" #include "lldb/Target/Process.h" #include "lldb/Target/StopInfo.h" #include "lldb/Utility/ArchSpec.h" @@ -21,14 +32,33 @@ using namespace lldb_private::repro; using namespace lldb; namespace { + +#ifdef _WIN32 +using SetThreadDescriptionFunctionPtr = HRESULT +WINAPI (*)(HANDLE hThread, PCWSTR lpThreadDescription); + +static SetThreadDescriptionFunctionPtr SetThreadName; +#endif + class ThreadTest : public ::testing::Test { public: void SetUp() override { FileSystem::Initialize(); HostInfo::Initialize(); +#ifdef _WIN32 + HMODULE hModule = ::LoadLibraryW(L"Kernel32.dll"); + if (hModule) { + SetThreadName = reinterpret_cast<SetThreadDescriptionFunctionPtr>( + ::GetProcAddress(hModule, "SetThreadDescription")); + } + PlatformWindows::Initialize(); +#endif platform_linux::PlatformLinux::Initialize(); } void TearDown() override { +#ifdef _WIN32 + PlatformWindows::Terminate(); 
+#endif platform_linux::PlatformLinux::Terminate(); HostInfo::Terminate(); FileSystem::Terminate(); @@ -88,6 +118,47 @@ TargetSP CreateTarget(DebuggerSP &debugger_sp, ArchSpec &arch) { return target_sp; } +#ifdef _WIN32 +std::shared_ptr<TargetThreadWindows> +CreateWindowsThread(const ProcessWindowsSP &process_sp, std::thread &t) { + HostThread host_thread((lldb::thread_t)t.native_handle()); + ThreadSP thread_sp = + std::make_shared<TargetThreadWindows>(*process_sp.get(), host_thread); + return std::static_pointer_cast<TargetThreadWindows>(thread_sp); +} + +TEST_F(ThreadTest, GetThreadDescription) { + if (!SetThreadName) + return; + + ArchSpec arch(HostInfo::GetArchitecture()); + Platform::SetHostPlatform(PlatformWindows::CreateInstance(true, &arch)); + + DebuggerSP debugger_sp = Debugger::CreateInstance(); + ASSERT_TRUE(debugger_sp); + + TargetSP target_sp = CreateTarget(debugger_sp, arch); + ASSERT_TRUE(target_sp); + + ListenerSP listener_sp(Listener::MakeListener("dummy")); + auto process_sp = std::static_pointer_cast<ProcessWindows>( + ProcessWindows::CreateInstance(target_sp, listener_sp, nullptr, false)); + ASSERT_TRUE(process_sp); + + std::thread t([]() {}); + auto thread_sp = CreateWindowsThread(process_sp, t); + DWORD tid = thread_sp->GetHostThread().GetNativeThread().GetThreadId(); + HANDLE hThread = ::OpenThread(THREAD_SET_LIMITED_INFORMATION, FALSE, tid); + ASSERT_TRUE(hThread); + + SetThreadName(hThread, L"thread name"); + ::CloseHandle(hThread); + ASSERT_STREQ(thread_sp->GetName(), "thread name"); + + t.join(); +} +#endif + TEST_F(ThreadTest, SetStopInfo) { ArchSpec arch("powerpc64-pc-linux"); From b223aebd3ff9fd705d0b9054023ad6b77c933d92 Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Thu, 21 Dec 2023 19:01:47 +0800 Subject: [PATCH 059/342] [X86][NFC] Refine code in X86InstrArithmetic.td 1. Remove redundant classes 2. Correct comments 3. Move duplicated `let` statement into class definition 4. Simplify the variable name and align the code --- llvm/lib/Target/X86/X86InstrArithmetic.td | 1279 +++++++++------------ llvm/lib/Target/X86/X86InstrUtils.td | 9 + 2 files changed, 566 insertions(+), 722 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 46b430a842ef0..664ba316cd75b 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -44,107 +44,69 @@ def PLEA32r : PseudoI<(outs GR32:$dst), (ins anymem:$src), []>; def PLEA64r : PseudoI<(outs GR64:$dst), (ins anymem:$src), []>; } -//===----------------------------------------------------------------------===// -// Fixed-Register Multiplication and Division Instructions. -// - -// BinOpRR - Binary instructions with inputs "reg, reg". -class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - dag outlist, X86FoldableSchedWrite sched, list<dag> pattern> - : ITy<opcode, MRMDestReg, typeinfo, outlist, - (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern>, - Sched<[sched]>; - -// BinOpRR_F - Binary instructions with inputs "reg, reg", where the pattern -// has just a EFLAGS as a result.
-class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDPatternOperator opnode> - : BinOpRR<opcode, mnemonic, typeinfo, (outs), WriteALU, - [(set EFLAGS, - (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>; - -// BinOpRR_RF - Binary instructions with inputs "reg, reg", where the pattern -// has both a regclass and EFLAGS as a result. -class BinOpRR_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode> - : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU, - [(set typeinfo.RegClass:$dst, EFLAGS, - (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))]>; - -// BinOpRR_RFF - Binary instructions with inputs "reg, reg", where the pattern -// has both a regclass and EFLAGS as a result, and has EFLAGS as input. -class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode> - : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC, - [(set typeinfo.RegClass:$dst, EFLAGS, - (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2, - EFLAGS))]>; - -// BinOpRR_Rev - Binary instructions with inputs "reg, reg"(reversed encoding). -class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - X86FoldableSchedWrite sched = WriteALU> - : ITy<opcode, MRMSrcReg, typeinfo, - (outs typeinfo.RegClass:$dst), - (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), - mnemonic, "{$src2, $dst|$dst, $src2}", []>, - Sched<[sched]> { - // The disassembler should know about this, but not the asmparser. - let isCodeGenOnly = 1; - let ForceDisassemble = 1; - let hasSideEffects = 0; +// BinOpRR - Instructions that read "reg, reg". +class BinOpRR<bits<8> o, string m, X86TypeInfo t, dag out, list<dag> p> + : ITy<o, MRMDestReg, t, out, (ins t.RegClass:$src1, t.RegClass:$src2), m, + "{$src2, $src1|$src1, $src2}", p>, Sched<[WriteALU]>; +// BinOpRR_F - Instructions that read "reg, reg" and write EFLAGS only. +class BinOpRR_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node> + : BinOpRR<o, m, t, (outs), + [(set EFLAGS, (node t.RegClass:$src1, t.RegClass:$src2))]>, + DefEFLAGS; +// BinOpRR_F_Rev - Reversed encoding of BinOpRR_F +class BinOpRR_F_Rev<bits<8> o, string m, X86TypeInfo t> + : BinOpRR_F<o, m, t, null_frag>, DisassembleOnly { + let Form = MRMSrcReg; } - -// BinOpRR_RFF_Rev - Binary instructions with inputs "reg, reg"(reversed -// encoding), with sched = WriteADC. -class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> - : BinOpRR_Rev<opcode, mnemonic, typeinfo, WriteADC>; - -// BinOpRR_F_Rev - Binary instructions with inputs "reg, reg"(reversed -// encoding), without outlist dag. -class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo> - : ITy<opcode, MRMSrcReg, typeinfo, (outs), - (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", []>, - Sched<[WriteALU]> { - // The disassembler should know about this, but not the asmparser. - let isCodeGenOnly = 1; - let ForceDisassemble = 1; - let hasSideEffects = 0; +// BinOpRR_RF - Instructions that read "reg, reg", and write "reg", EFLAGS. +class BinOpRR_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node> + : BinOpRR<o, m, t, (outs t.RegClass:$dst), + [(set t.RegClass:$dst, EFLAGS, + (node t.RegClass:$src1, t.RegClass:$src2))]>, DefEFLAGS; +// BinOpRR_RF_Rev - Reversed encoding of BinOpRR_RF. 
+class BinOpRR_RF_Rev<bits<8> o, string m, X86TypeInfo t> + : BinOpRR_RF<o, m, t, null_frag>, DisassembleOnly { + let Form = MRMSrcReg; +} +// BinOpRRF_RF - Instructions that read "reg, reg", write "reg" and read/write +// EFLAGS. +class BinOpRRF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node> + : BinOpRR<o, m, t, (outs t.RegClass:$dst), + [(set t.RegClass:$dst, EFLAGS, + (node t.RegClass:$src1, t.RegClass:$src2, + EFLAGS))]>, DefEFLAGS, UseEFLAGS { + let SchedRW = [WriteADC]; +} +// BinOpRRF_RF_Rev - Reversed encoding of BinOpRRF_RF +class BinOpRRF_RF_Rev<bits<8> o, string m, X86TypeInfo t> + : BinOpRRF_RF<o, m, t, null_frag>, DisassembleOnly { + let Form = MRMSrcReg; } -// BinOpRM - Binary instructions with inputs "reg, [mem]". -class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - dag outlist, X86FoldableSchedWrite sched, list<dag> pattern> - : ITy<opcode, MRMSrcMem, typeinfo, outlist, - (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern>, - Sched<[sched.Folded, sched.ReadAfterFold]>; - -// BinOpRM_F - Binary instructions with inputs "reg, [mem]", where the pattern -// has just a EFLAGS as a result. -class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode> - : BinOpRM<opcode, mnemonic, typeinfo, (outs), WriteALU, - [(set EFLAGS, - (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; - -// BinOpRM_RF - Binary instructions with inputs "reg, [mem]", where the pattern -// has both a regclass and EFLAGS as a result. -class BinOpRM_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode> - : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteALU, - [(set typeinfo.RegClass:$dst, EFLAGS, - (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; - -// BinOpRM_RFF - Binary instructions with inputs "reg, [mem]", where the pattern -// has both a regclass and EFLAGS as a result, and has EFLAGS as input. -class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode> - : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), WriteADC, - [(set typeinfo.RegClass:$dst, EFLAGS, - (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2), - EFLAGS))]> { +// BinOpRM - Instructions that read "reg, [mem]". +class BinOpRM<bits<8> o, string m, X86TypeInfo t, dag out, list<dag> p> + : ITy<o, MRMSrcMem, t, out, (ins t.RegClass:$src1, t.MemOperand:$src2), m, + "{$src2, $src1|$src1, $src2}", p>, + Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]> { + let mayLoad = 1; +} +// BinOpRM_F - Instructions that read "reg, [mem]" and write EFLAGS only. +class BinOpRM_F<bits<8> o, string m, X86TypeInfo t, SDNode node> + : BinOpRM<o, m, t, (outs), + [(set EFLAGS, (node t.RegClass:$src1, + (t.LoadNode addr:$src2)))]>, DefEFLAGS; +// BinOpRM_RF - Instructions that read "reg, reg", and write "reg", EFLAGS. +class BinOpRM_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node> + : BinOpRM<o, m, t, (outs t.RegClass:$dst), + [(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1, + (t.LoadNode addr:$src2)))]>, DefEFLAGS; +// BinOpRMF_RF - Instructions that read "reg, [mem]", write "reg" and read/write +// EFLAGS. 
+class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDNode node> + : BinOpRM<o, m, t, (outs t.RegClass:$dst), + [(set t.RegClass:$dst, EFLAGS, + (node t.RegClass:$src1, (t.LoadNode addr:$src2), EFLAGS))]>, + DefEFLAGS, UseEFLAGS { let SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold, // base, scale, index, offset, segment. ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -152,395 +114,327 @@ class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, WriteADC.ReadAfterFold]; } -// BinOpRI - Binary instructions with inputs "reg, imm". -class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern> - : ITy<opcode, f, typeinfo, outlist, - (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern>, - Sched<[sched]> { - let ImmT = typeinfo.ImmEncoding; +// BinOpRI - Instructions that read "reg, imm". +class BinOpRI<bits<8> o, string m, X86TypeInfo t, Format f, dag out, list<dag> p> + : ITy<o, f, t, out, (ins t.RegClass:$src1, t.ImmOperand:$src2), m, + "{$src2, $src1|$src1, $src2}", p>, Sched<[WriteALU]> { + let ImmT = t.ImmEncoding; } - -// BinOpRI_F - Binary instructions with inputs "reg, imm", where the pattern -// has EFLAGS as a result. -class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDPatternOperator opnode, Format f> - : BinOpRI<opcode, mnemonic, typeinfo, f, (outs), WriteALU, - [(set EFLAGS, - (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; - -// BinOpRI_RF - Binary instructions with inputs "reg, imm", where the pattern -// has both a regclass and EFLAGS as a result. -class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> - : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU, - [(set typeinfo.RegClass:$dst, EFLAGS, - (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>; - -// BinOpRI_RFF - Binary instructions with inputs "reg, imm", where the pattern -// has both a regclass and EFLAGS as a result, and has EFLAGS as input. -class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> - : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC, - [(set typeinfo.RegClass:$dst, EFLAGS, - (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2, - EFLAGS))]>; - -// BinOpRI8 - Binary instructions with inputs "reg, imm8". -class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Format f, dag outlist, X86FoldableSchedWrite sched, list<dag> pattern> - : ITy<opcode, f, typeinfo, outlist, - (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2), - mnemonic, "{$src2, $src1|$src1, $src2}", pattern>, - Sched<[sched]> { - let ImmT = Imm8; // Always 8-bit immediate. +// BinOpRI_F - Instructions that read "reg, imm" and write EFLAGS only. +class BinOpRI_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, + Format f> + : BinOpRI<o, m, t, f, (outs), + [(set EFLAGS, (node t.RegClass:$src1, + t.ImmOperator:$src2))]>, DefEFLAGS; +// BinOpRI_RF - Instructions that read "reg, imm" and write "reg", EFLAGS. +class BinOpRI_RF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> + : BinOpRI<o, m, t, f, (outs t.RegClass:$dst), + [(set t.RegClass:$dst, EFLAGS, + (node t.RegClass:$src1, t.ImmOperator:$src2))]>, DefEFLAGS; +// BinOpRIF_RF - Instructions that read "reg, imm", write "reg" and read/write +// EFLAGS. 
+class BinOpRIF_RF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> + : BinOpRI<o, m, t, f, (outs t.RegClass:$dst), + [(set t.RegClass:$dst, EFLAGS, + (node t.RegClass:$src1, t.ImmOperator:$src2, + EFLAGS))]>, DefEFLAGS, UseEFLAGS { + let SchedRW = [WriteADC]; +} +// BinOpRI8 - Instructions that read "reg, imm8". +class BinOpRI8<bits<8> o, string m, X86TypeInfo t, Format f, dag out> + : ITy<o, f, t, out, (ins t.RegClass:$src1, t.Imm8Operand:$src2), m, + "{$src2, $src1|$src1, $src2}", []>, Sched<[WriteALU]> { + let ImmT = Imm8; +} +// BinOpRI8_F - Instructions that read "reg, imm8" and write EFLAGS only. +class BinOpRI8_F<bits<8> o, string m, X86TypeInfo t, Format f> + : BinOpRI8<o, m, t, f, (outs)>, DefEFLAGS; +// BinOpRI8_RF - Instructions that read "reg, imm8" and write "reg", EFLAGS. +class BinOpRI8_RF<bits<8> o, string m, X86TypeInfo t, Format f> + : BinOpRI8<o, m, t, f, (outs t.RegClass:$dst)>, DefEFLAGS; +// BinOpRI8F_RF - Instructions that read "reg, imm", write "reg" and read/write +// EFLAGS. +class BinOpRI8F_RF<bits<8> o, string m, X86TypeInfo t, Format f> + : BinOpRI8<o, m, t, f, (outs t.RegClass:$dst)>, DefEFLAGS, UseEFLAGS { + let SchedRW = [WriteADC]; } -// BinOpRI8_F - Binary instructions with inputs "reg, imm8". -class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, Format f> - : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs), WriteALU, []>; - -// BinOpRI8_RF - Binary instructions with inputs "reg, imm8". -class BinOpRI8_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, Format f> - : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteALU, []>; - -// BinOpRI8_RFF - Binary instructions with inputs "reg, imm8". -class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, Format f> - : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst), WriteADC, []>; - -// BinOpMR - Binary instructions with inputs "[mem], reg". -class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - list<dag> pattern> - : ITy<opcode, MRMDestMem, typeinfo, - (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src), - mnemonic, "{$src, $dst|$dst, $src}", pattern>; - -// BinOpMR_RMW - Binary instructions with inputs "[mem], reg", where the pattern -// implicitly use EFLAGS. -class BinOpMR_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode> - : BinOpMR<opcode, mnemonic, typeinfo, - [(store (opnode (load addr:$dst), typeinfo.RegClass:$src), addr:$dst), +// BinOpMR - Instructions that read "[mem], reg". +class BinOpMR<bits<8> o, string m, X86TypeInfo t, list<dag> p> + : ITy<o, MRMDestMem, t, (outs), (ins t.MemOperand:$dst, t.RegClass:$src), m, + "{$src, $dst|$dst, $src}", p> { + let mayLoad = 1; +} +// BinOpMR_F - Instructions that read "[mem], imm8" and write EFLAGS only. +class BinOpMR_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node> + : BinOpMR<o, m, t, + [(set EFLAGS, (node (t.LoadNode addr:$dst), t.RegClass:$src))]>, + Sched<[WriteALU.Folded, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, ReadDefault, WriteALU.ReadAfterFold]>, DefEFLAGS; +// BinOpMR_MF - Instructions that read "[mem], reg" and write "[mem]", EFLAGS. 
+class BinOpMR_MF<bits<8> o, string m, X86TypeInfo t, SDNode node> + : BinOpMR<o, m, t, + [(store (node (load addr:$dst), t.RegClass:$src), addr:$dst), (implicit EFLAGS)]>, Sched<[WriteALURMW, // base, scale, index, offset, segment - ReadDefault, ReadDefault, ReadDefault, - ReadDefault, ReadDefault, - WriteALU.ReadAfterFold]>; // reg - -// BinOpMR_RMW_FF - Binary instructions with inputs "[mem], reg", where the -// pattern sets EFLAGS and implicitly uses EFLAGS. -class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode> - : BinOpMR<opcode, mnemonic, typeinfo, - [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS), - addr:$dst), - (implicit EFLAGS)]>, + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + WriteALU.ReadAfterFold]>, // reg + DefEFLAGS { + let mayStore = 1; +} +// BinOpMRF_MF - Instructions that read "[mem], reg", write "[mem]" and +// read/write EFLAGS. +class BinOpMRF_MF<bits<8> o, string m, X86TypeInfo t, SDNode node> + : BinOpMR<o, m, t, + [(store (node (load addr:$dst), t.RegClass:$src, EFLAGS), + addr:$dst), (implicit EFLAGS)]>, Sched<[WriteADCRMW, // base, scale, index, offset, segment ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, WriteALU.ReadAfterFold, // reg - WriteALU.ReadAfterFold]>; // EFLAGS - -// BinOpMR_F - Binary instructions with inputs "[mem], reg", where the pattern -// has EFLAGS as a result. -class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDPatternOperator opnode> - : BinOpMR<opcode, mnemonic, typeinfo, - [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst), - typeinfo.RegClass:$src))]>, - Sched<[WriteALU.Folded, ReadDefault, ReadDefault, ReadDefault, - ReadDefault, ReadDefault, WriteALU.ReadAfterFold]>; - -// BinOpMI - Binary instructions with inputs "[mem], imm". -class BinOpMI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Format f, list<dag> pattern> - : ITy<opcode, f, typeinfo, - (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src), - mnemonic, "{$src, $dst|$dst, $src}", pattern> { - let ImmT = typeinfo.ImmEncoding; + WriteALU.ReadAfterFold]>, // EFLAGS + DefEFLAGS, UseEFLAGS { + let mayStore = 1; } -// BinOpMI_RMW - Binary instructions with inputs "[mem], imm", where the -// pattern implicitly use EFLAGS. -class BinOpMI_RMW<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> - : BinOpMI<opcode, mnemonic, typeinfo, f, - [(store (opnode (typeinfo.VT (load addr:$dst)), - typeinfo.ImmOperator:$src), addr:$dst), - (implicit EFLAGS)]>, - Sched<[WriteALURMW]>; - -// BinOpMI_RMW_FF - Binary instructions with inputs "[mem], imm", where the -// pattern sets EFLAGS and implicitly uses EFLAGS. -class BinOpMI_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode, Format f> - : BinOpMI<opcode, mnemonic, typeinfo, f, - [(store (opnode (typeinfo.VT (load addr:$dst)), - typeinfo.ImmOperator:$src, EFLAGS), addr:$dst), - (implicit EFLAGS)]>, - Sched<[WriteADCRMW]>; - -// BinOpMI_F - Binary instructions with inputs "[mem], imm", where the pattern -// has EFLAGS as a result. -class BinOpMI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDPatternOperator opnode, Format f> - : BinOpMI<opcode, mnemonic, typeinfo, f, - [(set EFLAGS, (opnode (typeinfo.LoadNode addr:$dst), - typeinfo.ImmOperator:$src))]>, - Sched<[WriteALU.Folded]>; - -// BinOpMI8 - Binary instructions with inputs "[mem], imm8". 
-class BinOpMI8<string mnemonic, X86TypeInfo typeinfo, - Format f, list<dag> pattern> - : ITy<0x82, f, typeinfo, - (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src), - mnemonic, "{$src, $dst|$dst, $src}", pattern> { - let ImmT = Imm8; // Always 8-bit immediate. +// BinOpMI - Instructions that read "[mem], imm". +class BinOpMI<bits<8> o, string m, X86TypeInfo t, Format f, list<dag> p> + : ITy<o, f, t, (outs), (ins t.MemOperand:$dst, t.ImmOperand:$src), m, + "{$src, $dst|$dst, $src}", p> { + let ImmT = t.ImmEncoding; + let mayLoad = 1; +} +// BinOpMI_F - Instructions that read "[mem], imm" and write EFLAGS only. +class BinOpMI_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, + Format f> + : BinOpMI<o, m, t, f, + [(set EFLAGS, (node (t.LoadNode addr:$dst), t.ImmOperator:$src))]>, + Sched<[WriteALU.Folded]>, DefEFLAGS; +// BinOpMI_MF - Instructions that read "[mem], imm" and write "[mem]", EFLAGS. +class BinOpMI_MF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> + : BinOpMI<o, m, t, f, + [(store (node (t.VT (load addr:$dst)), + t.ImmOperator:$src), addr:$dst), (implicit EFLAGS)]>, + Sched<[WriteALURMW]>, DefEFLAGS { + let mayStore = 1; +} +// BinOpMIF_MF - Instructions that read "[mem], imm", write "[mem]" and +// read/write EFLAGS. +class BinOpMIF_MF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> + : BinOpMI<o, m, t, f, + [(store (node (t.VT (load addr:$dst)), + t.ImmOperator:$src, EFLAGS), addr:$dst), (implicit EFLAGS)]>, + Sched<[WriteADCRMW]>, DefEFLAGS, UseEFLAGS { + let mayStore = 1; } -// BinOpMI8_RMW - Binary instructions with inputs "[mem], imm8". -class BinOpMI8_RMW<string mnemonic, X86TypeInfo typeinfo, Format f> - : BinOpMI8<mnemonic, typeinfo, f, []>, Sched<[WriteALURMW]>; - -// BinOpMI8_RMW_FF - Binary instructions with inputs "[mem], imm8". -class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo, Format f> - : BinOpMI8<mnemonic, typeinfo, f, []>, Sched<[WriteADCRMW]>; - -// BinOpMI8_F - Binary instructions with inputs "[mem], imm8" -class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo, Format f> - : BinOpMI8<mnemonic, typeinfo, f, []>, Sched<[WriteALU.Folded]>; - -// BinOpAI - Binary instructions with input imm, that implicitly use A reg and -// implicitly define Areg and EFLAGS. -class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Register areg, string operands, X86FoldableSchedWrite sched = WriteALU> - : ITy<opcode, RawFrm, typeinfo, - (outs), (ins typeinfo.ImmOperand:$src), - mnemonic, operands, []>, - Sched<[sched]> { - let ImmT = typeinfo.ImmEncoding; +// BinOpMI8 - Instructions that read "[mem], imm8". +class BinOpMI8<string m, X86TypeInfo t, Format f> + : ITy<0x82, f, t, (outs), (ins t.MemOperand:$dst, t.Imm8Operand:$src), m, + "{$src, $dst|$dst, $src}", []> { + let ImmT = Imm8; + let mayLoad = 1; +} +// BinOpMI8_F - Instructions that read "[mem], imm8" and write EFLAGS only. +class BinOpMI8_F<string m, X86TypeInfo t, Format f> + : BinOpMI8<m, t, f>, Sched<[WriteALU.Folded]>, DefEFLAGS; +// BinOpMI8_MF - Instructions that read "[mem], imm8" and write "[mem]", EFLAGS. +class BinOpMI8_MF<string m, X86TypeInfo t, Format f> + : BinOpMI8<m, t, f>, Sched<[WriteALURMW]>, DefEFLAGS { + let mayStore = 1; +} +// BinOpMI8F_MF - Instructions that read "[mem], imm8", write "[mem]" and +// read/write EFLAGS. 
+class BinOpMI8F_MF<string m, X86TypeInfo t, Format f> + : BinOpMI8<m, t, f>, Sched<[WriteADCRMW]>, DefEFLAGS, UseEFLAGS { + let mayStore = 1; +} + +// BinOpAI - Instructions that read "a-reg imm" (Accumulator register). +class BinOpAI<bits<8> o, string m, X86TypeInfo t, Register areg, string args> + : ITy<o, RawFrm, t, (outs), (ins t.ImmOperand:$src), m, args, []>, + Sched<[WriteALU]> { + let ImmT = t.ImmEncoding; let Uses = [areg]; - let Defs = [areg, EFLAGS]; - let hasSideEffects = 0; } +// BinOpAI_F - Instructions that read "a-reg imm" and write EFLAGS only. +class BinOpAI_F<bits<8> o, string m, X86TypeInfo t, Register areg, string args> + : BinOpAI<o, m, t, areg, args>, DefEFLAGS; -// BinOpAI_RFF - Binary instructions with input imm, that implicitly use and -// define Areg and EFLAGS. -class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Register areg, string operands> - : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, WriteADC> { +// BinOpAI_AF - Instructions that read "a-reg imm" and write a-reg/EFLAGS. +class BinOpAI_AF<bits<8> o, string m, X86TypeInfo t, Register areg, + string args> : BinOpAI<o, m, t, areg, args> { + let Defs = [areg, EFLAGS]; +} +// BinOpAIF_AF - Instructions that read "a-reg imm", write a-reg and read/write +// EFLAGS. +class BinOpAIF_AF<bits<8> o, string m, X86TypeInfo t, Register areg, + string args> : BinOpAI<o, m, t, areg, args> { let Uses = [areg, EFLAGS]; + let Defs = [areg, EFLAGS]; + let SchedRW = [WriteADC]; } -// BinOpAI_F - Binary instructions with input imm, that implicitly use A reg and -// implicitly define EFLAGS. -class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Register areg, string operands> - : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> { - let Defs = [EFLAGS]; +// UnaryOpR - Instructions that read "reg" and write "reg". +class UnaryOpR<bits<8> o, Format f, string m, X86TypeInfo t, list<dag> p> + : ITy<o, f, t, (outs t.RegClass:$dst), + (ins t.RegClass:$src1), m, "$dst", p>, Sched<[WriteALU]>; + +// UnaryOpM - Instructions that read "[mem]" and writes "[mem]". +class UnaryOpM<bits<8> o, Format f, string m, X86TypeInfo t, list<dag> p> + : ITy<o, f, t, (outs), (ins t.MemOperand:$dst), m, "$dst", p>, + Sched<[WriteALURMW]> { + let mayLoad = 1; + let mayStore = 1; } -// UnaryOpM - Unary instructions with a memory operand. -class UnaryOpM<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info, - list<dag> pattern> - : ITy<opcode, f, info, (outs), (ins info.MemOperand:$dst), mnemonic, - "$dst", pattern>; - -// UnaryOpR - Unary instructions with a register. -class UnaryOpR<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info, - list<dag> pattern> - : ITy<opcode, f, info, (outs info.RegClass:$dst), - (ins info.RegClass:$src1), mnemonic, "$dst", pattern>; - -// INCDECR - Instructions like "inc reg". -class INCDECR<Format f, string mnemonic, X86TypeInfo info, - SDPatternOperator node> - : UnaryOpR<0xFE, f, mnemonic, info, - [(set info.RegClass:$dst, EFLAGS, - (node info.RegClass:$src1, 1))]>; - -// INCDECM - Instructions like "inc [mem]". -class INCDECM<Format f, string mnemonic, X86TypeInfo info, int num> - : UnaryOpM<0xFE, f, mnemonic, info, - [(store (add (info.LoadNode addr:$dst), num), addr:$dst), - (implicit EFLAGS)]>; - -// INCDECR_ALT - Instructions like "inc reg" short forms. -class INCDECR_ALT<bits<8> opcode, string mnemonic, X86TypeInfo info> - : UnaryOpR<opcode, AddRegFrm, mnemonic, info, []>{ +// INCDECR - Instructions like "inc reg". 
+class INCDECR<Format f, string m, X86TypeInfo t, SDPatternOperator node> + : UnaryOpR<0xFE, f, m, t, + [(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1, 1))]>, + DefEFLAGS { + let isConvertibleToThreeAddress = 1; // Can xform into LEA. +} + +// INCDECM - Instructions like "inc [mem]". +class INCDECM<Format f, string m, X86TypeInfo t, int num> + : UnaryOpM<0xFE, f, m, t, + [(store (add (t.LoadNode addr:$dst), num), addr:$dst), + (implicit EFLAGS)]>, DefEFLAGS; + +// INCDECR_ALT - Instructions like "inc reg" short forms. +class INCDECR_ALT<bits<8> o, string m, X86TypeInfo t> + : UnaryOpR<o, AddRegFrm, m, t, []>, DefEFLAGS { + // Short forms only valid in 32-bit mode. Selected during MCInst lowering. let Predicates = [Not64BitMode]; - let Opcode = opcode; + let Opcode = o; } -// MulOpR - Instructions like "mul reg". -class MulOpR<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info, - X86FoldableSchedWrite sched, list<dag> pattern> - : ITy<opcode, f, info, (outs), (ins info.RegClass:$src), mnemonic, - "$src", pattern>, - Sched<[sched]>; - -// MulOpM - Instructions like "mul [mem]". -class MulOpM<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info, - X86FoldableSchedWrite sched, list<dag> pattern> - : ITy<opcode, f, info, (outs), (ins info.MemOperand:$src), mnemonic, - "$src", pattern>, SchedLoadReg<sched>; - -// NegOpR - Instructions like "neg reg", with implicit EFLAGS. -class NegOpR<bits<8> opcode, string mnemonic, X86TypeInfo info> - : UnaryOpR<opcode, MRM3r, mnemonic, info, - [(set info.RegClass:$dst, (ineg info.RegClass:$src1)), - (implicit EFLAGS)]>; - -// NotOpR - Instructions like "not reg". -class NotOpR<bits<8> opcode, string mnemonic, X86TypeInfo info> - : UnaryOpR<opcode, MRM2r, mnemonic, info, - [(set info.RegClass:$dst, - (not info.RegClass:$src1))]>; - -// NegOpM - Instructions like "neg [mem]", with implicit EFLAGS. -class NegOpM<bits<8> opcode, string mnemonic, X86TypeInfo info> - : UnaryOpM<opcode, MRM3m, mnemonic, info, - [(store (ineg (info.LoadNode addr:$dst)), addr:$dst), - (implicit EFLAGS)]>; - -// NotOpM - Instructions like "neg [mem]". -class NotOpM<bits<8> opcode, string mnemonic, X86TypeInfo info> - : UnaryOpM<opcode, MRM2m, mnemonic, info, - [(store (not (info.LoadNode addr:$dst)), addr:$dst)]>; - -// BinOpRR_C - Binary instructions with inputs "reg, reg", which used mainly -// with Constraints = "$src1 = $dst". -class BinOpRR_C<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info, - list<dag> pattern> - : ITy<opcode, f, info, (outs info.RegClass:$dst), - (ins info.RegClass:$src1, info.RegClass:$src2), - mnemonic, "{$src2, $dst|$dst, $src2}", pattern>; - -// BinOpRM_C - Binary instructions with inputs "reg, [mem]", which used mainly -// with Constraints = "$src1 = $dst". -class BinOpRM_C<bits<8> opcode, Format f, string mnemonic, X86TypeInfo info, - list<dag> pattern> - : ITy<opcode, f, info, (outs info.RegClass:$dst), - (ins info.RegClass:$src1, info.MemOperand:$src2), - mnemonic, "{$src2, $dst|$dst, $src2}", pattern>; +// MulOpR - Instructions like "mul reg". +class MulOpR<bits<8> o, Format f, string m, X86TypeInfo t, + X86FoldableSchedWrite sched, list<dag> p> + : ITy<o, f, t, (outs), (ins t.RegClass:$src), m, "$src", p>, Sched<[sched]>; + +// MulOpM - Instructions like "mul [mem]". 
+class MulOpM<bits<8> o, Format f, string m, X86TypeInfo t, + X86FoldableSchedWrite sched, list<dag> p> + : ITy<o, f, t, (outs), (ins t.MemOperand:$src), m, + "$src", p>, SchedLoadReg<sched> { + let mayLoad = 1; +} + +// NegOpR - Instructions like "neg reg". +class NegOpR<bits<8> o, string m, X86TypeInfo t> + : UnaryOpR<o, MRM3r, m, t, + [(set t.RegClass:$dst, (ineg t.RegClass:$src1)), + (implicit EFLAGS)]>, DefEFLAGS; + +// NegOpM - Instructions like "neg [mem]". +class NegOpM<bits<8> o, string m, X86TypeInfo t> + : UnaryOpM<o, MRM3m, m, t, + [(store (ineg (t.LoadNode addr:$dst)), addr:$dst), + (implicit EFLAGS)]>, DefEFLAGS; + +// NOTE: NOT does not set EFLAGS! +// NotOpR - Instructions like "not reg". +class NotOpR<bits<8> o, string m, X86TypeInfo t> + : UnaryOpR<o, MRM2r, m, t, [(set t.RegClass:$dst, (not t.RegClass:$src1))]>; + +// NotOpM - Instructions like "neg [mem]". +class NotOpM<bits<8> o, string m, X86TypeInfo t> + : UnaryOpM<o, MRM2m, m, t, + [(store (not (t.LoadNode addr:$dst)), addr:$dst)]>; // IMulOpRR - Instructions like "imul reg, reg, i8". -class IMulOpRR<bits<8> opcode, string mnemonic, X86TypeInfo info, - X86FoldableSchedWrite sched> - : BinOpRR_C<opcode, MRMSrcReg, mnemonic, info, - [(set info.RegClass:$dst, EFLAGS, - (X86smul_flag info.RegClass:$src1, - info.RegClass:$src2))]>, - Sched<[sched]>, TB; +class IMulOpRR<bits<8> o, string m, X86TypeInfo t, X86FoldableSchedWrite sched> + : BinOpRR_RF<o, m, t, X86smul_flag>, TB { + let Form = MRMSrcReg; + let SchedRW = [sched]; + // X = IMUL Y, Z --> X = IMUL Z, Y + let isCommutable = 1; +} // IMulOpRM - Instructions like "imul reg, reg, [mem]". -class IMulOpRM<bits<8> opcode, string mnemonic, X86TypeInfo info, - X86FoldableSchedWrite sched> - : BinOpRM_C<opcode, MRMSrcMem, mnemonic, info, - [(set info.RegClass:$dst, EFLAGS, - (X86smul_flag info.RegClass:$src1, (info.LoadNode addr:$src2)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>, TB; +class IMulOpRM<bits<8> o, string m, X86TypeInfo t, X86FoldableSchedWrite sched> + : BinOpRM_RF<o, m, t, X86smul_flag>, TB { +let Form = MRMSrcMem; +let SchedRW = [sched.Folded, sched.ReadAfterFold]; +} // IMulOpRRI8 - Instructions like "imul reg, reg, i8". -class IMulOpRRI8<bits<8> opcode, string mnemonic, X86TypeInfo info, +class IMulOpRRI8<bits<8> o, string m, X86TypeInfo t, X86FoldableSchedWrite sched> - : ITy<opcode, MRMSrcReg, info, (outs info.RegClass:$dst), - (ins info.RegClass:$src1, info.Imm8Operand:$src2), mnemonic, - "{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[sched]> { + : ITy<o, MRMSrcReg, t, (outs t.RegClass:$dst), + (ins t.RegClass:$src1, t.Imm8Operand:$src2), m, + "{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[sched]>, DefEFLAGS { let ImmT = Imm8; } // IMulOpRRI - Instructions like "imul reg, reg, i16/i32/i64". 
-class IMulOpRRI<bits<8> opcode, string mnemonic, X86TypeInfo info, +class IMulOpRRI<bits<8> o, string m, X86TypeInfo t, X86FoldableSchedWrite sched> - : ITy<opcode, MRMSrcReg, info, (outs info.RegClass:$dst), - (ins info.RegClass:$src1, info.ImmOperand:$src2), mnemonic, + : ITy<o, MRMSrcReg, t, (outs t.RegClass:$dst), + (ins t.RegClass:$src1, t.ImmOperand:$src2), m, "{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set info.RegClass:$dst, EFLAGS, - (X86smul_flag info.RegClass:$src1, - info.ImmNoSuOperator:$src2))]>, - Sched<[sched]>{ - let ImmT = info.ImmEncoding; + [(set t.RegClass:$dst, EFLAGS, (X86smul_flag t.RegClass:$src1, + t.ImmNoSuOperator:$src2))]>, + Sched<[sched]>, DefEFLAGS { + let ImmT = t.ImmEncoding; } // IMulOpRMI8 - Instructions like "imul reg, [mem], i8". -class IMulOpRMI8<bits<8> opcode, string mnemonic, X86TypeInfo info, +class IMulOpRMI8<bits<8> o, string m, X86TypeInfo t, X86FoldableSchedWrite sched> - : ITy<opcode, MRMSrcMem, info, (outs info.RegClass:$dst), - (ins info.MemOperand:$src1, info.Imm8Operand:$src2), mnemonic, - "{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[sched.Folded]> { + : ITy<o, MRMSrcMem, t, (outs t.RegClass:$dst), + (ins t.MemOperand:$src1, t.Imm8Operand:$src2), m, + "{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[sched.Folded]>, + DefEFLAGS { let ImmT = Imm8; + let mayLoad = 1; } // IMulOpRMI - Instructions like "imul reg, [mem], i16/i32/i64". -class IMulOpRMI<bits<8> opcode, string mnemonic, X86TypeInfo info, +class IMulOpRMI<bits<8> o, string m, X86TypeInfo t, X86FoldableSchedWrite sched> - : ITy<opcode, MRMSrcMem, info, (outs info.RegClass:$dst), - (ins info.MemOperand:$src1, info.ImmOperand:$src2), mnemonic, + : ITy<o, MRMSrcMem, t, (outs t.RegClass:$dst), + (ins t.MemOperand:$src1, t.ImmOperand:$src2), m, "{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set info.RegClass:$dst, EFLAGS, - (X86smul_flag (info.LoadNode addr:$src1), - info.ImmNoSuOperator:$src2))]>, - Sched<[sched.Folded]>{ - let ImmT = info.ImmEncoding; + [(set t.RegClass:$dst, EFLAGS, + (X86smul_flag (t.LoadNode addr:$src1), t.ImmNoSuOperator:$src2))]>, + Sched<[sched.Folded]>, DefEFLAGS { + let ImmT = t.ImmEncoding; } -let Defs = [EFLAGS] in { -let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { -// Short forms only valid in 32-bit mode. Selected during MCInst lowering. -let hasSideEffects = 0 in { +let Constraints = "$src1 = $dst" in { def INC16r_alt : INCDECR_ALT<0x40, "inc", Xi16>; def INC32r_alt : INCDECR_ALT<0x40, "inc", Xi32>; -} // hasSideEffects = 0 - -let isConvertibleToThreeAddress = 1 in { // Can xform into LEA. def INC8r : INCDECR<MRM0r, "inc", Xi8, X86add_flag_nocf>; def INC16r : INCDECR<MRM0r, "inc", Xi16, X86add_flag_nocf>; def INC32r : INCDECR<MRM0r, "inc", Xi32, X86add_flag_nocf>; def INC64r : INCDECR<MRM0r, "inc", Xi64, X86add_flag_nocf>; -} // isConvertibleToThreeAddress = 1 -} // Constraints = "$src1 = $dst", SchedRW - -let SchedRW = [WriteALURMW] in { -let Predicates = [UseIncDec] in { - def INC8m : INCDECM<MRM0m, "inc", Xi8, 1>; - def INC16m : INCDECM<MRM0m, "inc", Xi16, 1>; - def INC32m : INCDECM<MRM0m, "inc", Xi32, 1>; -} // Predicates -let Predicates = [UseIncDec, In64BitMode] in { - def INC64m : INCDECM<MRM0m, "inc", Xi64, 1>; -} // Predicates -} // SchedRW -let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { -// Short forms only valid in 32-bit mode. Selected during MCInst lowering. 
-let hasSideEffects = 0 in { def DEC16r_alt : INCDECR_ALT<0x48, "dec", Xi16>; def DEC32r_alt : INCDECR_ALT<0x48, "dec", Xi32>; -} // hasSideEffects = 0 - -let isConvertibleToThreeAddress = 1 in { // Can xform into LEA. def DEC8r : INCDECR<MRM1r, "dec", Xi8, X86sub_flag_nocf>; def DEC16r : INCDECR<MRM1r, "dec", Xi16, X86sub_flag_nocf>; def DEC32r : INCDECR<MRM1r, "dec", Xi32, X86sub_flag_nocf>; def DEC64r : INCDECR<MRM1r, "dec", Xi64, X86sub_flag_nocf>; -} // isConvertibleToThreeAddress = 1 -} // Constraints = "$src1 = $dst", SchedRW +} -let SchedRW = [WriteALURMW] in { let Predicates = [UseIncDec] in { - def DEC8m : INCDECM<MRM1m, "dec", Xi8, -1>; - def DEC16m : INCDECM<MRM1m, "dec", Xi16, -1>; - def DEC32m : INCDECM<MRM1m, "dec", Xi32, -1>; -} // Predicates +def INC8m : INCDECM<MRM0m, "inc", Xi8, 1>; +def INC16m : INCDECM<MRM0m, "inc", Xi16, 1>; +def INC32m : INCDECM<MRM0m, "inc", Xi32, 1>; +def DEC8m : INCDECM<MRM1m, "dec", Xi8, -1>; +def DEC16m : INCDECM<MRM1m, "dec", Xi16, -1>; +def DEC32m : INCDECM<MRM1m, "dec", Xi32, -1>; +} let Predicates = [UseIncDec, In64BitMode] in { - def DEC64m : INCDECM<MRM1m, "dec", Xi64, -1>; -} // Predicates -} // SchedRW -} // Defs = [EFLAGS] +def INC64m : INCDECM<MRM0m, "inc", Xi64, 1>; +def DEC64m : INCDECM<MRM1m, "dec", Xi64, -1>; +} // Extra precision multiplication @@ -554,14 +448,14 @@ def MUL8r : MulOpR<0xF6, MRM4r, "mul", Xi8, WriteIMul8, // syntax can be accepted. [(set AL, (mul AL, GR8:$src)), (implicit EFLAGS)]>; // AX,DX = AX*GR16 -let Defs = [AX,DX,EFLAGS], Uses = [AX], hasSideEffects = 0 in +let Defs = [AX,DX,EFLAGS], Uses = [AX] in def MUL16r : MulOpR<0xF7, MRM4r, "mul", Xi16, WriteIMul16, []>; // EAX,EDX = EAX*GR32 -let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], hasSideEffects = 0 in +let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in def MUL32r : MulOpR<0xF7, MRM4r, "mul", Xi32, WriteIMul32, [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>; // RAX,RDX = RAX*GR64 -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], hasSideEffects = 0 in +let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def MUL64r : MulOpR<0xF7, MRM4r, "mul", Xi64, WriteIMul64, [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/]>; // AL,AH = AL*[mem8] @@ -573,7 +467,6 @@ def MUL8m : MulOpM<0xF6, MRM4m, "mul", Xi8, WriteIMul8, [(set AL, (mul AL, (loadi8 addr:$src))), (implicit EFLAGS)]>; // AX,DX = AX*[mem16] -let mayLoad = 1, hasSideEffects = 0 in { let Defs = [AX,DX,EFLAGS], Uses = [AX] in def MUL16m : MulOpM<0xF7, MRM4m, "mul", Xi16, WriteIMul16, []>; // EAX,EDX = EAX*[mem32] @@ -582,10 +475,8 @@ def MUL32m : MulOpM<0xF7, MRM4m, "mul", Xi32, WriteIMul32, []>; // RAX,RDX = RAX*[mem64] let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def MUL64m : MulOpM<0xF7, MRM4m, "mul", Xi64, WriteIMul64, []>, - Requires<[In64BitMode]>; -} + Requires<[In64BitMode]>; -let hasSideEffects = 0 in { // AL,AH = AL*GR8 let Defs = [AL,EFLAGS,AX], Uses = [AL] in def IMUL8r : MulOpR<0xF6, MRM5r, "imul", Xi8, WriteIMul8, []>; @@ -599,7 +490,6 @@ def IMUL32r : MulOpR<0xF7, MRM5r, "imul", Xi32, WriteIMul32, []>; let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def IMUL64r : MulOpR<0xF7, MRM5r, "imul", Xi64, WriteIMul64, []>; -let mayLoad = 1 in { // AL,AH = AL*[mem8] let Defs = [AL,EFLAGS,AX], Uses = [AL] in def IMUL8m : MulOpM<0xF6, MRM5m, "imul", Xi8, WriteIMul8, []>; @@ -612,28 +502,21 @@ def IMUL32m : MulOpM<0xF7, MRM5m, "imul", Xi32, WriteIMul32, []>; // RAX,RDX = RAX*[mem64] let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def IMUL64m : MulOpM<0xF7, MRM5m, "imul", Xi64, WriteIMul64, []>, - 
Requires<[In64BitMode]>; -} + Requires<[In64BitMode]>; -let Defs = [EFLAGS] in { let Constraints = "$src1 = $dst" in { -let isCommutable = 1 in { -// X = IMUL Y, Z --> X = IMUL Z, Y // Register-Register Signed Integer Multiply def IMUL16rr : IMulOpRR<0xAF, "imul", Xi16, WriteIMul16Reg>; def IMUL32rr : IMulOpRR<0xAF, "imul", Xi32, WriteIMul32Reg>; def IMUL64rr : IMulOpRR<0xAF, "imul", Xi64, WriteIMul64Reg>; -} // isCommutable // Register-Memory Signed Integer Multiply def IMUL16rm : IMulOpRM<0xAF, "imul", Xi16, WriteIMul16Reg>; def IMUL32rm : IMulOpRM<0xAF, "imul", Xi32, WriteIMul32Reg>; def IMUL64rm : IMulOpRM<0xAF, "imul", Xi64, WriteIMul64Reg>; -} // Constraints = "$src1 = $dst" -} // Defs = [EFLAGS] +} // Surprisingly enough, these are not two address instructions! -let Defs = [EFLAGS] in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. @@ -653,7 +536,6 @@ def IMUL64rri32 : IMulOpRRI<0x69, "imul", Xi64, WriteIMul64Imm>; // Memory-Integer Signed Integer Multiply // GR16 = [mem16]*I8 -let mayLoad = 1 in { def IMUL16rmi8 : IMulOpRMI8<0x6B, "imul", Xi16, WriteIMul16Imm>; // GR16 = [mem16]*I16 def IMUL16rmi : IMulOpRMI<0x69, "imul", Xi16, WriteIMul16Imm>; @@ -665,9 +547,6 @@ def IMUL32rmi : IMulOpRMI<0x69, "imul", Xi32, WriteIMul32Imm>; def IMUL64rmi8 : IMulOpRMI8<0x6B, "imul", Xi64, WriteIMul64Imm>; // GR64 = [mem64]*I32 def IMUL64rmi32 : IMulOpRMI<0x69, "imul", Xi64, WriteIMul64Imm>; -} // mayLoad -} // Defs = [EFLAGS] -} // hasSideEffects // unsigned division/remainder let hasSideEffects = 1 in { // so that we don't speculatively execute @@ -713,7 +592,6 @@ def IDIV32r: MulOpR<0xF7, MRM7r, "idiv", Xi32, WriteIDiv32, []>; let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in def IDIV64r: MulOpR<0xF7, MRM7r, "idiv", Xi64, WriteIDiv64, []>; -let mayLoad = 1 in { let Defs = [AL,AH,EFLAGS], Uses = [AX] in // AX/[mem8] = AL,AH def IDIV8m : MulOpM<0xF6, MRM7m, "idiv", Xi8, WriteIDiv8, []>; @@ -727,47 +605,31 @@ let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX // RDX:RAX/[mem64] = RAX,RDX def IDIV64m: MulOpM<0xF7, MRM7m, "idiv", Xi64, WriteIDiv64, []>, Requires<[In64BitMode]>; -} } // hasSideEffects = 1 -//===----------------------------------------------------------------------===// -// Two address Instructions. -// - -// unary instructions -let Defs = [EFLAGS] in { -let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +let Constraints = "$src1 = $dst" in { def NEG8r : NegOpR<0xF6, "neg", Xi8>; def NEG16r : NegOpR<0xF7, "neg", Xi16>; def NEG32r : NegOpR<0xF7, "neg", Xi32>; def NEG64r : NegOpR<0xF7, "neg", Xi64>; -} // Constraints = "$src1 = $dst", SchedRW +} -// Read-modify-write negate. -let SchedRW = [WriteALURMW] in { def NEG8m : NegOpM<0xF6, "neg", Xi8>; def NEG16m : NegOpM<0xF7, "neg", Xi16>; def NEG32m : NegOpM<0xF7, "neg", Xi32>; def NEG64m : NegOpM<0xF7, "neg", Xi64>, Requires<[In64BitMode]>; -} // SchedRW -} // Defs = [EFLAGS] - -// Note: NOT does not set EFLAGS! 
- -let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { +let Constraints = "$src1 = $dst" in { def NOT8r : NotOpR<0xF6, "not", Xi8>; def NOT16r : NotOpR<0xF7, "not", Xi16>; def NOT32r : NotOpR<0xF7, "not", Xi32>; def NOT64r : NotOpR<0xF7, "not", Xi64>; -} // Constraints = "$src1 = $dst", SchedRW +} -let SchedRW = [WriteALURMW] in { def NOT8m : NotOpM<0xF6, "not", Xi8>; def NOT16m : NotOpM<0xF7, "not", Xi16>; def NOT32m : NotOpM<0xF7, "not", Xi32>; def NOT64m : NotOpM<0xF7, "not", Xi64>, Requires<[In64BitMode]>; -} // SchedRW /// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is /// defined with "(set GPR:$dst, EFLAGS, (...". @@ -779,81 +641,73 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, SDNode opnodeflag, SDNode opnode, bit CommutableRR, bit ConvertibleToThreeAddress, bit ConvertibleToThreeAddressRR> { - let Defs = [EFLAGS] in { - let Constraints = "$src1 = $dst" in { - let isCommutable = CommutableRR in { - let isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in { - def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; - def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; - def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; - def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; - } // isConvertibleToThreeAddress - } // isCommutable - - def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>; - def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>; - def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>; - - def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; - def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>; - def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; - def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; - - let isConvertibleToThreeAddress = ConvertibleToThreeAddress, hasSideEffects= 0 in { - def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; - - // NOTE: These are order specific, we want the ri8 forms to be listed - // first so that they are slightly preferred to the ri forms. - def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, RegMRM>; - def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, RegMRM>; - def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, RegMRM>; - - def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; - def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; - def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; - } - } // Constraints = "$src1 = $dst" - - let mayLoad = 1, mayStore = 1, hasSideEffects = 0 in { - def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>; - - // NOTE: These are order specific, we want the mi8 forms to be listed - // first so that they are slightly preferred to the mi forms. 
- def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, MemMRM>; - def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, MemMRM>; - let Predicates = [In64BitMode] in - def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, MemMRM>; - - def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>; - let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>; + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR, + isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in { + def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; + def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; + def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; + def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; } - // These are for the disassembler since 0x82 opcode behaves like 0x80, but - // not in 64-bit mode. - let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, - hasSideEffects = 0 in { - let Constraints = "$src1 = $dst" in - def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>; - let mayLoad = 1, mayStore = 1 in - def NAME#8mi8 : BinOpMI8_RMW<mnemonic, Xi8, MemMRM>; + def NAME#8rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; + def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>; + def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; + def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. + def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, RegMRM>; + def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, RegMRM>; + def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, RegMRM>; + + def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; } - } // Defs = [EFLAGS] - - def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; - def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; - def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + } // Constraints = "$src1 = $dst" + + def NAME#8mr : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. 
+ def NAME#16mi8 : BinOpMI8_MF<mnemonic, Xi16, MemMRM>; + def NAME#32mi8 : BinOpMI8_MF<mnemonic, Xi32, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi8 : BinOpMI8_MF<mnemonic, Xi64, MemMRM>; + + def NAME#8mi : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_MF<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_MF<0x80, mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi32 : BinOpMI_MF<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. + let Predicates = [Not64BitMode] in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; + def NAME#8mi8 : BinOpMI8_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; + } + + def NAME#8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } /// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is @@ -866,80 +720,73 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, string mnemonic, Format RegMRM, Format MemMRM, SDNode opnode, bit CommutableRR, bit ConvertibleToThreeAddress> { - let Uses = [EFLAGS], Defs = [EFLAGS] in { - let Constraints = "$src1 = $dst" in { - let isCommutable = CommutableRR in { - def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>; - let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>; - } // isConvertibleToThreeAddress - } // isCommutable - - def NAME#8rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>; - def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>; - def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>; - - def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>; - def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>; - def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>; - def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>; - - def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>; - - let isConvertibleToThreeAddress = ConvertibleToThreeAddress, hasSideEffects = 0 in { - // NOTE: These are order specific, we want the ri8 forms to be listed - // first so that they are slightly preferred to the ri forms. 
- def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, RegMRM>; - def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, RegMRM>; - def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, RegMRM>; - - def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>; - def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>; - def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>; - } - } // Constraints = "$src1 = $dst" - - def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>; - - // NOTE: These are order specific, we want the mi8 forms to be listed - // first so that they are slightly preferred to the mi forms. - let mayLoad = 1, mayStore = 1, hasSideEffects = 0 in { - def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, MemMRM>; - def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, MemMRM>; - let Predicates = [In64BitMode] in - def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, MemMRM>; - - def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>; - let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>; - } - - // These are for the disassembler since 0x82 opcode behaves like 0x80, but - // not in 64-bit mode. - let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, - hasSideEffects = 0 in { - let Constraints = "$src1 = $dst" in - def NAME#8ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi8, RegMRM>; - let mayLoad = 1, mayStore = 1 in - def NAME#8mi8 : BinOpMI8_RMW_FF<mnemonic, Xi8, MemMRM>; + let Constraints = "$src1 = $dst" in { + let isCommutable = CommutableRR in { + def NAME#8rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode>; + } // isConvertibleToThreeAddress + } // isCommutable + + def NAME#8rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>; + + def NAME#8ri : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. 
+ def NAME#16ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi16, RegMRM>; + def NAME#32ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi32, RegMRM>; + def NAME#64ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi64, RegMRM>; + + def NAME#16ri : BinOpRIF_RF<0x80, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRIF_RF<0x80, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRIF_RF<0x80, mnemonic, Xi64, opnode, RegMRM>; } - } // Uses = [EFLAGS], Defs = [EFLAGS] - - def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX, + } // Constraints = "$src1 = $dst" + + def NAME#8mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. + def NAME#16mi8 : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>; + def NAME#32mi8 : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi8 : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>; + + def NAME#8mi : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMIF_MF<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMIF_MF<0x80, mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi32 : BinOpMIF_MF<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. + let Predicates = [Not64BitMode] in { + let Constraints = "$src1 = $dst" in + def NAME#8ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; + def NAME#8mi8 : BinOpMI8F_MF<mnemonic, Xi8, MemMRM>, DisassembleOnly; + } + + def NAME#8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX, "{$src, %ax|ax, $src}">; - def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX, + def NAME#32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX, "{$src, %eax|eax, $src}">; - def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX, + def NAME#64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">; } @@ -949,80 +796,75 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, /// multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, string mnemonic, Format RegMRM, Format MemMRM, - SDNode opnode, - bit CommutableRR, bit ConvertibleToThreeAddress> { - let Defs = [EFLAGS] in { - let isCommutable = CommutableRR in { - def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; - let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; - } - } // isCommutable - - def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>; - def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>; - def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; - - def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; - def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>; - def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; - def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, 
opnode>; - - def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; - - let isConvertibleToThreeAddress = ConvertibleToThreeAddress, hasSideEffects = 0 in { - // NOTE: These are order specific, we want the ri8 forms to be listed - // first so that they are slightly preferred to the ri forms. - def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, RegMRM>; - def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, RegMRM>; - def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, RegMRM>; - - def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>; - def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>; - def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>; - } - - def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>; - def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; - - // NOTE: These are order specific, we want the mi8 forms to be listed - // first so that they are slightly preferred to the mi forms. - let mayLoad = 1, hasSideEffects = 0 in { - def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, MemMRM>; - def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, MemMRM>; - let Predicates = [In64BitMode] in - def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, MemMRM>; - - def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>; - let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>; - } - - // These are for the disassembler since 0x82 opcode behaves like 0x80, but - // not in 64-bit mode. - let Predicates = [Not64BitMode], isCodeGenOnly = 1, ForceDisassemble = 1, - hasSideEffects = 0 in { - def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>; - let mayLoad = 1 in - def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>; - } - } // Defs = [EFLAGS] - - def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; + SDNode opnode, bit CommutableRR, + bit ConvertibleToThreeAddress> { + let isCommutable = CommutableRR in { + def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; + } // isConvertibleToThreeAddress + } // isCommutable + + def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; + def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>; + def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; + + def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; + def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>; + def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; + + def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; + + let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { + // NOTE: These are order specific, we want the ri8 forms to be listed + // first so that they are slightly preferred to the ri forms. 
+ def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, RegMRM>; + def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, RegMRM>; + def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, RegMRM>; + + def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>; + } + + def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; + def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>; + def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; + + // NOTE: These are order specific, we want the mi8 forms to be listed + // first so that they are slightly preferred to the mi forms. + def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, MemMRM>; + def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, MemMRM>; + + def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; + def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in + def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>; + + // These are for the disassembler since 0x82 opcode behaves like 0x80, but + // not in 64-bit mode. + let Predicates = [Not64BitMode] in { + def NAME#8ri8 : BinOpRI8_F<0x82, mnemonic, Xi8, RegMRM>, DisassembleOnly; + let mayLoad = 1 in + def NAME#8mi8 : BinOpMI8_F<mnemonic, Xi8, MemMRM>; + } + + def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; + "{$src, %ax|ax, $src}">; def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; + "{$src, %eax|eax, $src}">; def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + "{$src, %rax|rax, $src}">; } @@ -1193,44 +1035,37 @@ def : Pat<(store (X86adc_flag i64relocImmSExt32_su:$src, (load addr:$dst), EFLAG // they don't have all the usual imm8 and REV forms, and are encoded into a // different space. let isCompare = 1 in { - let Defs = [EFLAGS] in { - let isCommutable = 1 in { - // Avoid selecting these and instead use a test+and. Post processing will - // combine them. This gives bunch of other patterns that start with - // and a chance to match. 
- def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , null_frag>; - def TEST16rr : BinOpRR_F<0x84, "test", Xi16, null_frag>; - def TEST32rr : BinOpRR_F<0x84, "test", Xi32, null_frag>; - def TEST64rr : BinOpRR_F<0x84, "test", Xi64, null_frag>; - } // isCommutable - - let hasSideEffects = 0, mayLoad = 1 in { - def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , null_frag>; - def TEST16mr : BinOpMR_F<0x84, "test", Xi16, null_frag>; - def TEST32mr : BinOpMR_F<0x84, "test", Xi32, null_frag>; - def TEST64mr : BinOpMR_F<0x84, "test", Xi64, null_frag>; - } - - def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; - def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; - def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; - def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; - - def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; - def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>; - def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>; - let Predicates = [In64BitMode] in - def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>; - } // Defs = [EFLAGS] - - def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, - "{$src, %al|al, $src}">; - def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX, - "{$src, %ax|ax, $src}">; - def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX, - "{$src, %eax|eax, $src}">; - def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX, - "{$src, %rax|rax, $src}">; + let isCommutable = 1 in { + // Avoid selecting these and instead use a test+and. Post processing will + // combine them. This gives bunch of other patterns that start with + // and a chance to match. + def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , null_frag>; + def TEST16rr : BinOpRR_F<0x84, "test", Xi16, null_frag>; + def TEST32rr : BinOpRR_F<0x84, "test", Xi32, null_frag>; + def TEST64rr : BinOpRR_F<0x84, "test", Xi64, null_frag>; + } // isCommutable + +def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , null_frag>; +def TEST16mr : BinOpMR_F<0x84, "test", Xi16, null_frag>; +def TEST32mr : BinOpMR_F<0x84, "test", Xi32, null_frag>; +def TEST64mr : BinOpMR_F<0x84, "test", Xi64, null_frag>; + +def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; +def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; +def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; +def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; + +def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; +def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>; +def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>; + + let Predicates = [In64BitMode] in + def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>; + +def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, "{$src, %al|al, $src}">; +def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX, "{$src, %ax|ax, $src}">; +def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX, "{$src, %eax|eax, $src}">; +def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX, "{$src, %rax|rax, $src}">; } // isCompare // Patterns to match a relocImm into the immediate field. @@ -1355,29 +1190,29 @@ let Uses = [RDX] in // We don't have patterns for these as there is no advantage over ADC for // most code. 
class ADCOXOpRR <bits<8> opcode, string mnemonic, X86TypeInfo info> - : BinOpRR_C<opcode, MRMSrcReg, mnemonic, info, []>{ + : BinOpRR_RF<opcode, mnemonic, info, null_frag> { let Opcode = opcode; let OpSize = OpSizeFixed; + let Form = MRMSrcReg; } class ADCOXOpRM <bits<8> opcode, string mnemonic, X86TypeInfo info> - : BinOpRM_C<opcode, MRMSrcMem, mnemonic, info, []>{ + : BinOpRM_RF<opcode, mnemonic, info, null_frag> { let Opcode = opcode; let OpSize = OpSizeFixed; + let Form = MRMSrcMem; } -let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], - Constraints = "$src1 = $dst", hasSideEffects = 0 in { +let Predicates = [HasADX], Constraints = "$src1 = $dst" in { let SchedRW = [WriteADC], isCommutable = 1 in { def ADCX32rr : ADCOXOpRR<0xF6, "adcx", Xi32>, T8PD; def ADCX64rr : ADCOXOpRR<0xF6, "adcx", Xi64>, T8PD; def ADOX32rr : ADCOXOpRR<0xF6, "adox", Xi32>, T8XS; def ADOX64rr : ADCOXOpRR<0xF6, "adox", Xi64>, T8XS; - } // SchedRW + } - let mayLoad = 1, - SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold, + let SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold, // Memory operand. ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, // Implicit read of EFLAGS @@ -1387,5 +1222,5 @@ let Predicates = [HasADX], Defs = [EFLAGS], Uses = [EFLAGS], def ADOX32rm : ADCOXOpRM<0xF6, "adox", Xi32>, T8XS; def ADOX64rm : ADCOXOpRM<0xF6, "adox", Xi64>, T8XS; - } // mayLoad, SchedRW + } } diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 3d8f4e642aebe..80854bf606316 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -116,6 +116,14 @@ class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; } class ExplicitREX2Prefix { ExplicitOpPrefix explicitOpPrefix = ExplicitREX2; } class ExplicitVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitVEX; } class ExplicitEVEXPrefix { ExplicitOpPrefix explicitOpPrefix = ExplicitEVEX; } +class DefEFLAGS { list<Register> Defs = [EFLAGS]; } +class UseEFLAGS { list<Register> Uses = [EFLAGS]; } +class DisassembleOnly { + // The disassembler should know about this, but not the asmparser. + bit isCodeGenOnly = 1; + bit ForceDisassemble = 1; +} + // SchedModel info for instruction that loads one value and gets the second // (and possibly third) value from a register. @@ -988,6 +996,7 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, f, outs, ins, !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> { + let hasSideEffects = 0; // Infer instruction prefixes from type info. let OpSize = typeinfo.OpSize; let hasREX_W = typeinfo.HasREX_W; From 513c2151cd0cddd90af91a6614b15b74b538963e Mon Sep 17 00:00:00 2001 From: David Spickett <david.spickett@linaro.org> Date: Thu, 21 Dec 2023 13:10:33 +0000 Subject: [PATCH 060/342] [lldb][test] Only link Windows libraries on Windows ld.lld: error: unable to find library -llldbPluginProcessWindowsCommon https://lab.llvm.org/buildbot/#/builders/96/builds/50407 Fixes 95e5839e06fdffd278499257c6e7679bba3d6868. 
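A minimal sketch of the conditional-linking idiom this change applies, added here for context; the variable and library names are taken from the diff below (later patches in this series adjust the exact library list), so treat it as illustrative rather than the final CMake:

```cmake
# Collect the Windows-only plugin libraries into a helper list that stays
# empty on other hosts, so non-Windows links never reference them.
if (CMAKE_SYSTEM_NAME MATCHES "Windows")
  list(APPEND LLDB_WINDOWS_LIBS lldbPluginProcessWindowsCommon)
endif()

# The unit test's library list then references ${LLDB_WINDOWS_LIBS}, which
# expands to nothing on Linux/macOS and so avoids the ld.lld error above.
```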
--- lldb/unittests/Thread/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lldb/unittests/Thread/CMakeLists.txt b/lldb/unittests/Thread/CMakeLists.txt index f6c8795f349a5..a90643eb0c110 100644 --- a/lldb/unittests/Thread/CMakeLists.txt +++ b/lldb/unittests/Thread/CMakeLists.txt @@ -1,3 +1,8 @@ +if (CMAKE_SYSTEM_NAME MATCHES "Windows") + list(APPEND LLDB_WINDOWS_LIBS lldbPluginProcessWindows) + list(APPEND LLDB_WINDOWS_LIBS lldbPluginProcessWindowsCommon) +endif() + add_lldb_unittest(ThreadTests ThreadTest.cpp @@ -11,7 +16,6 @@ add_lldb_unittest(ThreadTests lldbInterpreter lldbBreakpoint lldbPluginPlatformLinux - lldbPluginPlatformWindows - lldbPluginProcessWindowsCommon + ${LLDB_WINDOWS_LIBS} ) From 17afa5befb4cbe86c22c25ae1603433c8bd21551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com> Date: Thu, 21 Dec 2023 13:20:16 +0000 Subject: [PATCH 061/342] [mlir][nfc] Update tests for Contract -> Op transforms (#76054) Updates two tests for vector.contract -> vector.outerproduct transformations: 1. Rename "vector-contract-to-outerproduct-transforms.mlir" as "vector-contract-to-outerproduct-matmul-transforms.mlir". The new name more accurate captures what's being tested. it is also consistent with "vector-contract-to-outerproduct-matvec-transforms.mlir", which covers vector matvec operations and makes finding relevant tests easier. 2. For matmul tests, move the traits definining the iteration spaces to the top of the file. This is consistent with how matvec tests are defined and also makes it easy to quickly identify what cases are covered. 3. For matmul tests, use more meaningful names for function arguments. This helps keep things consistent across the file (i.e. function definitions wih check lines and comments). 4. For matvec test, move a few tests around so that the most basic case (without masking) is first. 5. Update comments. --- ...ct-to-outerproduct-matmul-transforms.mlir} | 233 +++++++++--------- ...act-to-outerproduct-matvec-transforms.mlir | 60 ++--- 2 files changed, 149 insertions(+), 144 deletions(-) rename mlir/test/Dialect/Vector/{vector-contract-to-outerproduct-transforms.mlir => vector-contract-to-outerproduct-matmul-transforms.mlir} (81%) diff --git a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matmul-transforms.mlir similarity index 81% rename from mlir/test/Dialect/Vector/vector-contract-to-outerproduct-transforms.mlir rename to mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matmul-transforms.mlir index 7588b738ff9aa..7a60ff8ea8589 100644 --- a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matmul-transforms.mlir @@ -1,20 +1,22 @@ // RUN: mlir-opt %s --transform-interpreter --split-input-file | FileCheck %s -// NOTE - tests in this file are duplicated so that there's a version for -// * _fixed width_ and for _scalable_ vectors. -// In order for the "vector.contract -> vector.outerproduct" patterns to work, -// only the non-reduction dimension can be scalable (*). For Matmul operations -// that is set to be the N dimension (i.e. rows of the output matrix), which -// matches how matrix multiplication are normally implemented for e.g. -// Arm SVE. However, making the M dimension scalable (i.e. columns of the -// output matrix) should work as well. 
-// -// (*) The conversion tested in this file unrolls along the reduction -// dimension, which is not supported for scalable vectors. +/// Tests for `vector.contract` -> `vector.outerproduct` transformations for +/// matmul operations: +/// C += A * B. +/// (A, B and C are 2-d matrices). ATM three different variants / are tested: +/// * plain (no mask, fixed-wdith vectors), +/// * masked (fixed-width vectors, +/// * scalable (mask + scalable vectors). +/// In order for the "vector.contract -> vector.outerproduct" patterns to work, +/// only the non-reduction dimension can be scalable (*). For matmul operations +/// that is set to be the N dimension (i.e. rows of the output matrix), which +/// matches how matrix multiplication are normally implemented for e.g. +/// Arm SVE. However, making the M dimension scalable (i.e. columns of the +/// output matrix) should work as well. +/// +/// (*) The conversion tested in this file unrolls along the reduction +/// dimension, which is not supported for scalable vectors. -// ============================================================================ -// Matmul 0 (plain + masked + mixed types) -// ============================================================================ #matmat_accesses_0 = [ affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (k, n)>, @@ -25,6 +27,49 @@ iterator_types = ["parallel", "parallel", "reduction"] } +#matmat_accesses_1 = [ + affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (n, k)>, + affine_map<(m, n, k) -> (m, n)> +] +#matmat_trait_1 = { + indexing_maps = #matmat_accesses_1, + iterator_types = ["parallel", "parallel", "reduction"] +} + +#matmat_accesses_2 = [ + affine_map<(m, n, k) -> (k, m)>, + affine_map<(m, n, k) -> (k, n)>, + affine_map<(m, n, k) -> (m, n)> +] +#matmat_trait_2 = { + indexing_maps = #matmat_accesses_2, + iterator_types = ["parallel", "parallel", "reduction"] +} + +#matmat_accesses_3 = [ + affine_map<(m, n, k) -> (k, m)>, + affine_map<(m, n, k) -> (n, k)>, + affine_map<(m, n, k) -> (m, n)> +] +#matmat_trait_3 = { + indexing_maps = #matmat_accesses_3, + iterator_types = ["parallel", "parallel", "reduction"] +} + +#matmat_accesses_4 = [ + affine_map<(m, n, k) -> (m, k)>, + affine_map<(m, n, k) -> (k, n)>, + affine_map<(m, n, k) -> (n, m)> +] +#matmat_trait_4 = { + indexing_maps = #matmat_accesses_4, + iterator_types = ["parallel", "parallel", "reduction"] +} + +// ============================================================================ +// Matmul 0 (plain + masked + mixed types) +// ============================================================================ // CHECK-LABEL: func @matmul // CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x4xf32>, // CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<4x3xf32>, @@ -53,10 +98,10 @@ // CHECK-SAME: : vector<2xf32>, vector<3xf32> // // CHECK: return %[[c3]] : vector<2x3xf32> -func.func @matmul(%arg0: vector<2x4xf32>, - %arg1: vector<4x3xf32>, - %arg2: vector<2x3xf32>) -> vector<2x3xf32> { - %0 = vector.contract #matmat_trait_0 %arg0, %arg1, %arg2 +func.func @matmul(%A: vector<2x4xf32>, + %B: vector<4x3xf32>, + %C: vector<2x3xf32>) -> vector<2x3xf32> { + %0 = vector.contract #matmat_trait_0 %A, %B, %C : vector<2x4xf32>, vector<4x3xf32> into vector<2x3xf32> return %0 : vector<2x3xf32> } @@ -89,10 +134,10 @@ func.func @matmul(%arg0: vector<2x4xf32>, // CHECK-SAME: : vector<2xf32>, vector<[3]xf32> // // CHECK: return %[[c3]] : vector<2x[3]xf32> -func.func @matmul_scalable(%arg0: vector<2x4xf32>, - %arg1: vector<4x[3]xf32>, - %arg2: vector<2x[3]xf32>) -> 
vector<2x[3]xf32> { - %0 = vector.contract #matmat_trait_0 %arg0, %arg1, %arg2 +func.func @matmul_scalable(%A: vector<2x4xf32>, + %B: vector<4x[3]xf32>, + %C: vector<2x[3]xf32>) -> vector<2x[3]xf32> { + %0 = vector.contract #matmat_trait_0 %A, %B, %C : vector<2x4xf32>, vector<4x[3]xf32> into vector<2x[3]xf32> return %0 : vector<2x[3]xf32> } @@ -114,11 +159,11 @@ func.func @matmul_scalable(%arg0: vector<2x4xf32>, // CHECK: %[[T_MASK_R4:.*]] = vector.extract %[[T_MASK]][4] : vector<3x7xi1> from vector<5x3x7xi1> // CHECK: %{{.*}} = vector.mask %[[T_MASK_R4]] { vector.outerproduct %{{.*}} {kind = #vector.kind<add>} : vector<3xf32>, vector<7xf32> } : vector<3x7xi1> -> vector<3x7xf32> -func.func @masked_matmul(%arg0: vector<3x5xf32>, - %arg1: vector<5x7xf32>, - %arg2: vector<3x7xf32>, +func.func @masked_matmul(%A: vector<3x5xf32>, + %B: vector<5x7xf32>, + %C: vector<3x7xf32>, %m : vector<3x7x5xi1>) -> vector<3x7xf32> { - %0 = vector.mask %m { vector.contract #matmat_trait_0 %arg0, %arg1, %arg2 + %0 = vector.mask %m { vector.contract #matmat_trait_0 %A, %B, %C : vector<3x5xf32>, vector<5x7xf32> into vector<3x7xf32> } : vector<3x7x5xi1> -> vector<3x7xf32> return %0 : vector<3x7xf32> } @@ -140,11 +185,11 @@ func.func @masked_matmul(%arg0: vector<3x5xf32>, // CHECK: %[[T_MASK_R4:.*]] = vector.extract %[[T_MASK]][4] : vector<3x[7]xi1> from vector<5x3x[7]xi1> // CHECK: %{{.*}} = vector.mask %[[T_MASK_R4]] { vector.outerproduct %{{.*}} {kind = #vector.kind<add>} : vector<3xf32>, vector<[7]xf32> } : vector<3x[7]xi1> -> vector<3x[7]xf32> -func.func @masked_matmul_scalable(%arg0: vector<3x5xf32>, - %arg1: vector<5x[7]xf32>, - %arg2: vector<3x[7]xf32>, +func.func @masked_matmul_scalable(%A: vector<3x5xf32>, + %B: vector<5x[7]xf32>, + %C: vector<3x[7]xf32>, %m : vector<3x[7]x5xi1>) -> vector<3x[7]xf32> { - %0 = vector.mask %m { vector.contract #matmat_trait_0 %arg0, %arg1, %arg2 + %0 = vector.mask %m { vector.contract #matmat_trait_0 %A, %B, %C : vector<3x5xf32>, vector<5x[7]xf32> into vector<3x[7]xf32> } : vector<3x[7]x5xi1> -> vector<3x[7]xf32> return %0 : vector<3x[7]xf32> } @@ -160,11 +205,11 @@ func.func @masked_matmul_scalable(%arg0: vector<3x5xf32>, // CHECK: %[[b1:.*]] = arith.extf %[[b0]] : vector<3xf16> to vector<3xf32> // CHECK: %[[c0:.*]] = vector.outerproduct %[[a1]], %[[b1]], %[[C]] // CHECK: return %[[c0]] : vector<2x3xf32> -func.func @matmul_mixed(%arg0: vector<2x1xf16>, - %arg1: vector<1x3xf16>, - %arg2: vector<2x3xf32>) -> vector<2x3xf32> +func.func @matmul_mixed(%A: vector<2x1xf16>, + %B: vector<1x3xf16>, + %C: vector<2x3xf32>) -> vector<2x3xf32> { - %0 = vector.contract #matmat_trait_0 %arg0, %arg1, %arg2 + %0 = vector.contract #matmat_trait_0 %A, %B, %C : vector<2x1xf16>, vector<1x3xf16> into vector<2x3xf32> return %0 : vector<2x3xf32> } @@ -180,28 +225,18 @@ func.func @matmul_mixed(%arg0: vector<2x1xf16>, // CHECK: %[[b1:.*]] = arith.extf %[[b0]] : vector<[3]xf16> to vector<[3]xf32> // CHECK: %[[c0:.*]] = vector.outerproduct %[[a1]], %[[b1]], %[[C]] // CHECK: return %[[c0]] : vector<2x[3]xf32> -func.func @matmul_mixed_scalable(%arg0: vector<2x1xf16>, - %arg1: vector<1x[3]xf16>, - %arg2: vector<2x[3]xf32>) -> vector<2x[3]xf32> +func.func @matmul_mixed_scalable(%A: vector<2x1xf16>, + %B: vector<1x[3]xf16>, + %C: vector<2x[3]xf32>) -> vector<2x[3]xf32> { - %0 = vector.contract #matmat_trait_0 %arg0, %arg1, %arg2 + %0 = vector.contract #matmat_trait_0 %A, %B, %C : vector<2x1xf16>, vector<1x[3]xf16> into vector<2x[3]xf32> return %0 : vector<2x[3]xf32> } // 
============================================================================ -// Matmul 1 (plain) +// Matmul 1 (plain + scalable) // ============================================================================ -#matmat_accesses_1 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> -] -#matmat_trait_1 = { - indexing_maps = #matmat_accesses_1, - iterator_types = ["parallel", "parallel", "reduction"] -} - // CHECK-LABEL: func @matmul_1 // CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x1xf32>, // CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<3x1xf32>, @@ -212,11 +247,11 @@ func.func @matmul_mixed_scalable(%arg0: vector<2x1xf16>, // CHECK: %[[b0:.*]] = vector.extract %[[Bt]][0] : vector<3xf32> from vector<1x3xf32> // CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]] // CHECK: return %[[c0]] : vector<2x3xf32> -func.func @matmul_1(%arg0: vector<2x1xf32>, - %arg1: vector<3x1xf32>, - %arg2: vector<2x3xf32>) -> vector<2x3xf32> +func.func @matmul_1(%A: vector<2x1xf32>, + %B: vector<3x1xf32>, + %C: vector<2x3xf32>) -> vector<2x3xf32> { - %0 = vector.contract #matmat_trait_1 %arg0, %arg1, %arg2 + %0 = vector.contract #matmat_trait_1 %A, %B, %C : vector<2x1xf32>, vector<3x1xf32> into vector<2x3xf32> return %0 : vector<2x3xf32> } @@ -231,28 +266,18 @@ func.func @matmul_1(%arg0: vector<2x1xf32>, // CHECK: %[[b0:.*]] = vector.extract %[[Bt]][0] : vector<[3]xf32> from vector<1x[3]xf32> // CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]] // CHECK: return %[[c0]] : vector<2x[3]xf32> -func.func @matmul_1_scalable(%arg0: vector<2x1xf32>, - %arg1: vector<[3]x1xf32>, - %arg2: vector<2x[3]xf32>) -> vector<2x[3]xf32> +func.func @matmul_1_scalable(%A: vector<2x1xf32>, + %B: vector<[3]x1xf32>, + %C: vector<2x[3]xf32>) -> vector<2x[3]xf32> { - %0 = vector.contract #matmat_trait_1 %arg0, %arg1, %arg2 + %0 = vector.contract #matmat_trait_1 %A, %B, %C : vector<2x1xf32>, vector<[3]x1xf32> into vector<2x[3]xf32> return %0 : vector<2x[3]xf32> } // ============================================================================ -// Matmul 2 (plain) +// Matmul 2 (plain + scalable) // ============================================================================ -#matmat_accesses_2 = [ - affine_map<(m, n, k) -> (k, m)>, - affine_map<(m, n, k) -> (k, n)>, - affine_map<(m, n, k) -> (m, n)> -] -#matmat_trait_2 = { - indexing_maps = #matmat_accesses_2, - iterator_types = ["parallel", "parallel", "reduction"] -} - // CHECK-LABEL: func @matmul_2 // CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<1x2xf32>, // CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<1x3xf32>, @@ -261,11 +286,11 @@ func.func @matmul_1_scalable(%arg0: vector<2x1xf32>, // CHECK: %[[b0:.*]] = vector.extract %[[B]][0] : vector<3xf32> from vector<1x3xf32> // CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]] // CHECK: return %[[c0]] : vector<2x3xf32> -func.func @matmul_2(%arg0: vector<1x2xf32>, - %arg1: vector<1x3xf32>, - %arg2: vector<2x3xf32>) -> vector<2x3xf32> +func.func @matmul_2(%A: vector<1x2xf32>, + %B: vector<1x3xf32>, + %C: vector<2x3xf32>) -> vector<2x3xf32> { - %0 = vector.contract #matmat_trait_2 %arg0, %arg1, %arg2 + %0 = vector.contract #matmat_trait_2 %A, %B, %C : vector<1x2xf32>, vector<1x3xf32> into vector<2x3xf32> return %0 : vector<2x3xf32> } @@ -278,28 +303,18 @@ func.func @matmul_2(%arg0: vector<1x2xf32>, // CHECK: %[[b0:.*]] = vector.extract %[[B]][0] : vector<[3]xf32> from vector<1x[3]xf32> // CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]] // CHECK: return 
%[[c0]] : vector<2x[3]xf32> -func.func @matmul_2_scalable(%arg0: vector<1x2xf32>, - %arg1: vector<1x[3]xf32>, - %arg2: vector<2x[3]xf32>) -> vector<2x[3]xf32> +func.func @matmul_2_scalable(%A: vector<1x2xf32>, + %B: vector<1x[3]xf32>, + %C: vector<2x[3]xf32>) -> vector<2x[3]xf32> { - %0 = vector.contract #matmat_trait_2 %arg0, %arg1, %arg2 + %0 = vector.contract #matmat_trait_2 %A, %B, %C : vector<1x2xf32>, vector<1x[3]xf32> into vector<2x[3]xf32> return %0 : vector<2x[3]xf32> } // ============================================================================ -// Matmul 3 (plain) +// Matmul 3 (plain + scalable) // ============================================================================ -#matmat_accesses_3 = [ - affine_map<(m, n, k) -> (k, m)>, - affine_map<(m, n, k) -> (n, k)>, - affine_map<(m, n, k) -> (m, n)> -] -#matmat_trait_3 = { - indexing_maps = #matmat_accesses_3, - iterator_types = ["parallel", "parallel", "reduction"] -} - // CHECK-LABEL: func @matmul_3 // CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<1x2xf32>, // CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<3x1xf32>, @@ -309,11 +324,11 @@ func.func @matmul_2_scalable(%arg0: vector<1x2xf32>, // CHECK: %[[b0:.*]] = vector.extract %[[Bt]][0] : vector<3xf32> from vector<1x3xf32> // CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]] // CHECK: return %[[c0]] : vector<2x3xf32> -func.func @matmul_3(%arg0: vector<1x2xf32>, - %arg1: vector<3x1xf32>, - %arg2: vector<2x3xf32>) -> vector<2x3xf32> +func.func @matmul_3(%A: vector<1x2xf32>, + %B: vector<3x1xf32>, + %C: vector<2x3xf32>) -> vector<2x3xf32> { - %0 = vector.contract #matmat_trait_3 %arg0, %arg1, %arg2 + %0 = vector.contract #matmat_trait_3 %A, %B, %C : vector<1x2xf32>, vector<3x1xf32> into vector<2x3xf32> return %0 : vector<2x3xf32> } @@ -327,28 +342,18 @@ func.func @matmul_3(%arg0: vector<1x2xf32>, // CHECK: %[[b0:.*]] = vector.extract %[[Bt]][0] : vector<[3]xf32> from vector<1x[3]xf32> // CHECK: %[[c0:.*]] = vector.outerproduct %[[a0]], %[[b0]], %[[C]] // CHECK: return %[[c0]] : vector<2x[3]xf32> -func.func @matmul_3_scalable(%arg0: vector<1x2xf32>, - %arg1: vector<[3]x1xf32>, - %arg2: vector<2x[3]xf32>) -> vector<2x[3]xf32> +func.func @matmul_3_scalable(%A: vector<1x2xf32>, + %B: vector<[3]x1xf32>, + %C: vector<2x[3]xf32>) -> vector<2x[3]xf32> { - %0 = vector.contract #matmat_trait_3 %arg0, %arg1, %arg2 + %0 = vector.contract #matmat_trait_3 %A, %B, %C : vector<1x2xf32>, vector<[3]x1xf32> into vector<2x[3]xf32> return %0 : vector<2x[3]xf32> } // ============================================================================ -// Matmul 4 (plain) +// Matmul 4 (plain + scalable) // ============================================================================ -#matmat_accesses_4 = [ - affine_map<(m, n, k) -> (m, k)>, - affine_map<(m, n, k) -> (k, n)>, - affine_map<(m, n, k) -> (n, m)> -] -#matmat_trait_4 = { - indexing_maps = #matmat_accesses_4, - iterator_types = ["parallel", "parallel", "reduction"] -} - // CHECK-LABEL: func @matmul_4 // CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: vector<2x1xf32>, // CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: vector<1x3xf32>, @@ -358,11 +363,11 @@ func.func @matmul_3_scalable(%arg0: vector<1x2xf32>, // CHECK: %[[a0:.*]] = vector.extract %[[At]][0] : vector<2xf32> from vector<1x2xf32> // CHECK: %[[c0:.*]] = vector.outerproduct %[[b0]], %[[a0]], %[[C]] // CHECK: return %[[c0]] : vector<3x2xf32> -func.func @matmul_4(%arg0: vector<2x1xf32>, - %arg1: vector<1x3xf32>, - %arg2: vector<3x2xf32>) -> vector<3x2xf32> +func.func @matmul_4(%A: vector<2x1xf32>, + %B: 
vector<1x3xf32>, + %C: vector<3x2xf32>) -> vector<3x2xf32> { - %0 = vector.contract #matmat_trait_4 %arg0, %arg1, %arg2 + %0 = vector.contract #matmat_trait_4 %A, %B, %C : vector<2x1xf32>, vector<1x3xf32> into vector<3x2xf32> return %0 : vector<3x2xf32> } @@ -376,11 +381,11 @@ func.func @matmul_4(%arg0: vector<2x1xf32>, // CHECK: %[[a0:.*]] = vector.extract %[[At]][0] : vector<[2]xf32> from vector<1x[2]xf32> // CHECK: %[[c0:.*]] = vector.outerproduct %[[b0]], %[[a0]], %[[C]] // CHECK: return %[[c0]] : vector<3x[2]xf32> -func.func @matmul_4_scalable(%arg0: vector<[2]x1xf32>, - %arg1: vector<1x3xf32>, - %arg2: vector<3x[2]xf32>) -> vector<3x[2]xf32> +func.func @matmul_4_scalable(%A: vector<[2]x1xf32>, + %B: vector<1x3xf32>, + %C: vector<3x[2]xf32>) -> vector<3x[2]xf32> { - %0 = vector.contract #matmat_trait_4 %arg0, %arg1, %arg2 + %0 = vector.contract #matmat_trait_4 %A, %B, %C : vector<[2]x1xf32>, vector<1x3xf32> into vector<3x[2]xf32> return %0 : vector<3x[2]xf32> } diff --git a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir index c09a4d569638a..d86c6158bcdf2 100644 --- a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir @@ -235,6 +235,23 @@ func.func @masked_matvec_mk_k_m_max_scalable_parallel_dim(%A: vector<[2]x3xf32>, // ============================================================================ // Matvec 2 (plain + masked + scalable) // ============================================================================ +// CHECK-LABEL: func @matvec_km_k_m +// CHECK-SAME: %[[A:.*0]]: vector<2x2xf32> +// CHECK-SAME: %[[X:.*1]]: vector<2xf32> +// CHECK-SAME: %[[B:.*2]]: vector<2xf32> +// CHECK: %[[T3:.*]] = vector.extract %[[A]][0] : vector<2xf32> from vector<2x2xf32> +// CHECK: %[[T4:.*]] = vector.extract %[[X]][0] : f32 from vector<2xf32> +// CHECK: %[[T5:.*]] = vector.outerproduct %[[T3]], %[[T4]], %[[B]] {kind = #vector.kind<add>} : vector<2xf32>, f32 +// CHECK: %[[T6:.*]] = vector.extract %[[A]][1] : vector<2xf32> from vector<2x2xf32> +// CHECK: %[[T7:.*]] = vector.extract %[[X]][1] : f32 from vector<2xf32> +// CHECK: %[[T8:.*]] = vector.outerproduct %[[T6]], %[[T7]], %[[T5]] {kind = #vector.kind<add>} : vector<2xf32>, f32 +func.func @matvec_km_k_m(%A: vector<2x2xf32>, + %x: vector<2xf32>, + %b: vector<2xf32>) -> vector<2xf32> { + %0 = vector.contract #matvec_trait_2 %A, %x, %b : vector<2x2xf32>, vector<2xf32> into vector<2xf32> + return %0 : vector<2xf32> +} + // CHECK-LABEL: @masked_matvec_km_k_m // CHECK-SAME: %[[A:.+]]: vector<2x4xf32> // CHECK-SAME: %[[X:.+]]: vector<2xf32> @@ -273,26 +290,27 @@ func.func @masked_matvec_km_k_m_scalable_parallel_dim(%A: vector<2x[4]xf32>, return %res : vector<[4]xf32> } -// CHECK-LABEL: func @matvec_km_k_m +// ============================================================================ +// Matvec 3 (plain + masked + scalable) +// ============================================================================ +// CHECK-LABEL: func @matvec_k_mk_m // CHECK-SAME: %[[A:.*0]]: vector<2x2xf32> // CHECK-SAME: %[[X:.*1]]: vector<2xf32> // CHECK-SAME: %[[B:.*2]]: vector<2xf32> -// CHECK: %[[T3:.*]] = vector.extract %[[A]][0] : vector<2xf32> from vector<2x2xf32> -// CHECK: %[[T4:.*]] = vector.extract %[[X]][0] : f32 from vector<2xf32> -// CHECK: %[[T5:.*]] = vector.outerproduct %[[T3]], %[[T4]], %[[B]] {kind = #vector.kind<add>} : 
vector<2xf32>, f32 -// CHECK: %[[T6:.*]] = vector.extract %[[A]][1] : vector<2xf32> from vector<2x2xf32> -// CHECK: %[[T7:.*]] = vector.extract %[[X]][1] : f32 from vector<2xf32> -// CHECK: %[[T8:.*]] = vector.outerproduct %[[T6]], %[[T7]], %[[T5]] {kind = #vector.kind<add>} : vector<2xf32>, f32 -func.func @matvec_km_k_m(%A: vector<2x2xf32>, +// CHECK: %[[T3:.*]] = vector.transpose %[[A]], [1, 0] : vector<2x2xf32> to vector<2x2xf32> +// CHECK: %[[T4:.*]] = vector.extract %[[T3]][0] : vector<2xf32> from vector<2x2xf32> +// CHECK: %[[T5:.*]] = vector.extract %[[X]][0] : f32 from vector<2xf32> +// CHECK: %[[T6:.*]] = vector.outerproduct %[[T4]], %[[T5]], %[[B]] {kind = #vector.kind<add>} : vector<2xf32>, f32 +// CHECK: %[[T7:.*]] = vector.extract %[[T3]][1] : vector<2xf32> from vector<2x2xf32> +// CHECK: %[[T8:.*]] = vector.extract %[[X]][1] : f32 from vector<2xf32> +// CHECK: %[[T9:.*]] = vector.outerproduct %[[T7]], %[[T8]], %[[T6]] {kind = #vector.kind<add>} : vector<2xf32>, f32 +func.func @matvec_k_mk_m(%A: vector<2x2xf32>, %x: vector<2xf32>, %b: vector<2xf32>) -> vector<2xf32> { - %0 = vector.contract #matvec_trait_2 %A, %x, %b : vector<2x2xf32>, vector<2xf32> into vector<2xf32> + %0 = vector.contract #matvec_trait_3 %x, %A, %b : vector<2xf32>, vector<2x2xf32> into vector<2xf32> return %0 : vector<2xf32> } -// ============================================================================ -// Matvec 3 (plain + masked + scalable) -// ============================================================================ // CHECK-LABEL: @masked_matvec_k_mk_m // CHECK-SAME: %[[A:.+]]: vector<4x2xf32> // CHECK-SAME: %[[X:.+]]: vector<2xf32> @@ -331,24 +349,6 @@ func.func @masked_matvec_k_mk_m_scalable_parallel_dim(%A: vector<[4]x2xf32>, return %res : vector<[4]xf32> } -// CHECK-LABEL: func @matvec_k_mk_m -// CHECK-SAME: %[[A:.*0]]: vector<2x2xf32> -// CHECK-SAME: %[[X:.*1]]: vector<2xf32> -// CHECK-SAME: %[[B:.*2]]: vector<2xf32> -// CHECK: %[[T3:.*]] = vector.transpose %[[A]], [1, 0] : vector<2x2xf32> to vector<2x2xf32> -// CHECK: %[[T4:.*]] = vector.extract %[[T3]][0] : vector<2xf32> from vector<2x2xf32> -// CHECK: %[[T5:.*]] = vector.extract %[[X]][0] : f32 from vector<2xf32> -// CHECK: %[[T6:.*]] = vector.outerproduct %[[T4]], %[[T5]], %[[B]] {kind = #vector.kind<add>} : vector<2xf32>, f32 -// CHECK: %[[T7:.*]] = vector.extract %[[T3]][1] : vector<2xf32> from vector<2x2xf32> -// CHECK: %[[T8:.*]] = vector.extract %[[X]][1] : f32 from vector<2xf32> -// CHECK: %[[T9:.*]] = vector.outerproduct %[[T7]], %[[T8]], %[[T6]] {kind = #vector.kind<add>} : vector<2xf32>, f32 -func.func @matvec_k_mk_m(%A: vector<2x2xf32>, - %x: vector<2xf32>, - %b: vector<2xf32>) -> vector<2xf32> { - %0 = vector.contract #matvec_trait_3 %x, %A, %b : vector<2xf32>, vector<2x2xf32> into vector<2xf32> - return %0 : vector<2xf32> -} - // ============================================================================ // Matvec 4 (plain + masked + scalable) // ============================================================================ From dddb9d1ee3e283133ce1abb50b7c7a3715317b9d Mon Sep 17 00:00:00 2001 From: David Spickett <david.spickett@linaro.org> Date: Thu, 21 Dec 2023 13:27:00 +0000 Subject: [PATCH 062/342] [lldb][test] Fix missing-braces warnings in unit tests ``` /home/worker/2.0.1/lldb-x86_64-debian/llvm-project/lldb/unittests/Utility/ChecksumTest.cpp:15:38: warning: suggest braces around initialization of subobject [-Wmissing-braces] static llvm::MD5::MD5Result hash1 = {0, 1, 2, 3, 4, 5, 6, 7, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ { ``` 
And others. --- lldb/unittests/Utility/ChecksumTest.cpp | 12 ++++++------ lldb/unittests/Utility/FileSpecTest.cpp | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lldb/unittests/Utility/ChecksumTest.cpp b/lldb/unittests/Utility/ChecksumTest.cpp index 7537d30b5ff5b..a81aba2ee98ca 100644 --- a/lldb/unittests/Utility/ChecksumTest.cpp +++ b/lldb/unittests/Utility/ChecksumTest.cpp @@ -12,14 +12,14 @@ using namespace lldb_private; -static llvm::MD5::MD5Result hash1 = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; +static llvm::MD5::MD5Result hash1 = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}; -static llvm::MD5::MD5Result hash2 = {0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; +static llvm::MD5::MD5Result hash2 = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}; -static llvm::MD5::MD5Result hash3 = {8, 9, 10, 11, 12, 13, 14, 15, - 0, 1, 2, 3, 4, 5, 6, 7}; +static llvm::MD5::MD5Result hash3 = { + {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}}; TEST(ChecksumTest, TestConstructor) { Checksum checksum1; diff --git a/lldb/unittests/Utility/FileSpecTest.cpp b/lldb/unittests/Utility/FileSpecTest.cpp index 565395a495be6..9faad10e47301 100644 --- a/lldb/unittests/Utility/FileSpecTest.cpp +++ b/lldb/unittests/Utility/FileSpecTest.cpp @@ -536,7 +536,8 @@ TEST(FileSpecTest, TestGetComponents) { } TEST(FileSpecTest, TestChecksum) { - Checksum checksum({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + Checksum checksum(llvm::MD5::MD5Result{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}}); FileSpec file_spec("/foo/bar", FileSpec::Style::posix, checksum); EXPECT_TRUE(static_cast<bool>(file_spec.GetChecksum())); EXPECT_EQ(file_spec.GetChecksum(), checksum); From 3d9fc3fed036442e88f43e9d70362d1fd9f239ab Mon Sep 17 00:00:00 2001 From: madanial0 <118996571+madanial0@users.noreply.github.com> Date: Thu, 21 Dec 2023 08:58:55 -0500 Subject: [PATCH 063/342] [flang] add no-cpp-dep test for AIX 64 bit (#74637) Add a new test for no-cpp-dep on AIX as it requires 64 bit OBJECT_MODE since only 64-bit AIX is supported. AIX does not allow `-o /dev/null` and requires `-lpthread` flag to be added. --------- Co-authored-by: Mark Danial <mark.danial@ibm.com> --- flang/test/Runtime/no-cpp-dep.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/flang/test/Runtime/no-cpp-dep.c b/flang/test/Runtime/no-cpp-dep.c index f8fe97b5bf78e..654bebed345b1 100644 --- a/flang/test/Runtime/no-cpp-dep.c +++ b/flang/test/Runtime/no-cpp-dep.c @@ -5,7 +5,10 @@ a C compiler. 
REQUIRES: c-compiler -RUN: %cc -std=c99 %s -I%include %libruntime %libdecimal -lm -o /dev/null +RUN: %if system-aix %{ export OBJECT_MODE=64 %} +RUN: %cc -std=c99 %s -I%include %libruntime %libdecimal -lm \ +RUN: %if system-aix %{-lpthread %} +RUN: rm a.out */ #include "flang/Runtime/entry-names.h" From 11c2c0940b5e44920847b4d191a1272141de65f9 Mon Sep 17 00:00:00 2001 From: madanial0 <118996571+madanial0@users.noreply.github.com> Date: Thu, 21 Dec 2023 08:59:21 -0500 Subject: [PATCH 064/342] [Flang] Add fortran runtime libraries to AIX driver (#75921) Add fortran runtime libraries to flang-new on AIX Co-authored-by: Mark Danial <mark.danial@ibm.com> --- clang/lib/Driver/ToolChains/AIX.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index f9670ea6f251b..e6126ff62db3c 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -328,6 +328,12 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } + if (D.IsFlangMode()) { + addFortranRuntimeLibraryPath(ToolChain, Args, CmdArgs); + addFortranRuntimeLibs(ToolChain, Args, CmdArgs); + CmdArgs.push_back("-lm"); + CmdArgs.push_back("-lpthread"); + } const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(), Exec, CmdArgs, Inputs, Output)); From 55985db5fe82705234370848c47575db7a16437e Mon Sep 17 00:00:00 2001 From: David Spickett <david.spickett@linaro.org> Date: Thu, 21 Dec 2023 14:05:08 +0000 Subject: [PATCH 065/342] [lldb][test] Remove non-existent Windows lib from thread tests I assumed since it was in the PR and seemed like a logical library to have, it would exist, but only `...Common` exists. --- lldb/unittests/Thread/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/lldb/unittests/Thread/CMakeLists.txt b/lldb/unittests/Thread/CMakeLists.txt index a90643eb0c110..2b771d3cd309f 100644 --- a/lldb/unittests/Thread/CMakeLists.txt +++ b/lldb/unittests/Thread/CMakeLists.txt @@ -1,5 +1,4 @@ if (CMAKE_SYSTEM_NAME MATCHES "Windows") - list(APPEND LLDB_WINDOWS_LIBS lldbPluginProcessWindows) list(APPEND LLDB_WINDOWS_LIBS lldbPluginProcessWindowsCommon) endif() From f54249e79a507f4bfeaa9ce3f693dbe01c9af915 Mon Sep 17 00:00:00 2001 From: David Spickett <david.spickett@linaro.org> Date: Thu, 21 Dec 2023 14:07:27 +0000 Subject: [PATCH 066/342] [lldb][test] Link to PlatformWindows in thread tests Clearly I need my eyes checked, it wasn't linking to a non-existent library at all, I had the name wrong. --- lldb/unittests/Thread/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/unittests/Thread/CMakeLists.txt b/lldb/unittests/Thread/CMakeLists.txt index 2b771d3cd309f..8fc44da6f5b39 100644 --- a/lldb/unittests/Thread/CMakeLists.txt +++ b/lldb/unittests/Thread/CMakeLists.txt @@ -1,4 +1,5 @@ if (CMAKE_SYSTEM_NAME MATCHES "Windows") + list(APPEND LLDB_WINDOWS_LIBS lldbPluginPlatformWindows) list(APPEND LLDB_WINDOWS_LIBS lldbPluginProcessWindowsCommon) endif() From 70260860739fcbea2a5bee9a0d5e1d1d32ac6603 Mon Sep 17 00:00:00 2001 From: stephenpeckham <118857872+stephenpeckham@users.noreply.github.com> Date: Thu, 21 Dec 2023 08:17:32 -0600 Subject: [PATCH 067/342] [XCOFF] Use RLDs to print branches even without -r (#74342) This presents misleading and confusing output. 
If you have a function defined at the beginning of an XCOFF object file, and you have a function call to an external function, the function call disassembles as a branch to the local function. That is, `void f() { f(); g();}` disassembles as >00000000 <.f>: 0: 7c 08 02 a6 mflr 0 4: 94 21 ff c0 stwu 1, -64(1) 8: 90 01 00 48 stw 0, 72(1) c: 4b ff ff f5 bl 0x0 <.f> 10: 4b ff ff f1 bl 0x0 <.f> With this PR, the second call will display: `10: 4b ff ff f1 bl 0x0 <.g> ` Using -r can help, but you still get the confusing output: >10: 4b ff ff f1 bl 0x0 <.f> 00000010: R_RBR .g --- llvm/test/CodeGen/PowerPC/aix-return55.ll | 2 +- .../CodeGen/PowerPC/aix-xcoff-funcsect.ll | 8 +- llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll | 6 +- .../XCOFF/disassemble-symbolize-operands.ll | 2 +- .../XCOFF/disassemble-symbolize-operands2.ll | 65 +++++++ llvm/tools/llvm-objdump/XCOFFDump.cpp | 1 + llvm/tools/llvm-objdump/XCOFFDump.h | 1 + llvm/tools/llvm-objdump/llvm-objdump.cpp | 171 +++++++++++++----- 8 files changed, 197 insertions(+), 59 deletions(-) create mode 100644 llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands2.ll diff --git a/llvm/test/CodeGen/PowerPC/aix-return55.ll b/llvm/test/CodeGen/PowerPC/aix-return55.ll index c7d481ced140e..a36deda9c1469 100644 --- a/llvm/test/CodeGen/PowerPC/aix-return55.ll +++ b/llvm/test/CodeGen/PowerPC/aix-return55.ll @@ -39,7 +39,7 @@ entry: ;CHECKOBJ-NEXT: 1c: 67 8a bc de oris 10, 28, 48350{{[[:space:]] *}} ;CHECKOBJ32-NEXT: 00000020 <d>: ;CHECKOBJ64-NEXT: 0000000000000020 <d>: -;CHECKOBJ-NEXT: 20: 40 14 00 00 bdnzf 20, 0x20 +;CHECKOBJ-NEXT: 20: 40 14 00 00 bdnzf 20, 0x20 <d> ;CHECKOBJ-NEXT: 24: 00 00 00 00 <unknown>{{[[:space:]] *}} ;CHECKOBJ32-NEXT: 00000028 <foo>: ;CHECKOBJ32-NEXT: 28: 00 00 00 00 <unknown> diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll index a5056d407b76f..a557b6f4f1719 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-funcsect.ll @@ -189,7 +189,7 @@ entry: ; DIS32-NEXT: 40: 7c 08 02 a6 mflr 0 ; DIS32-NEXT: 44: 94 21 ff c0 stwu 1, -64(1) ; DIS32-NEXT: 48: 90 01 00 48 stw 0, 72(1) -; DIS32-NEXT: 4c: 4b ff ff b5 bl 0x0 <.alias_foo> +; DIS32-NEXT: 4c: 4b ff ff b5 bl 0x0 <.foo> ; DIS32-NEXT: 0000004c: R_RBR (idx: 7) .foo[PR] ; DIS32-NEXT: 50: 60 00 00 00 nop ; DIS32-NEXT: 54: 48 00 00 6d bl 0xc0 <.static_overalign_foo> @@ -198,7 +198,7 @@ entry: ; DIS32-NEXT: 5c: 4b ff ff a5 bl 0x0 <.alias_foo> ; DIS32-NEXT: 0000005c: R_RBR (idx: 9) .alias_foo ; DIS32-NEXT: 60: 60 00 00 00 nop -; DIS32-NEXT: 64: 4b ff ff 9d bl 0x0 <.alias_foo> +; DIS32-NEXT: 64: 4b ff ff 9d bl 0x0 <.extern_foo> ; DIS32-NEXT: 00000064: R_RBR (idx: 1) .extern_foo[PR] ; DIS32-NEXT: 68: 60 00 00 00 nop ; DIS32-NEXT: 6c: 4b ff ff b5 bl 0x20 <.hidden_foo> @@ -212,7 +212,7 @@ entry: ; DIS64-NEXT: 40: 7c 08 02 a6 mflr 0 ; DIS64-NEXT: 44: f8 21 ff 91 stdu 1, -112(1) ; DIS64-NEXT: 48: f8 01 00 80 std 0, 128(1) -; DIS64-NEXT: 4c: 4b ff ff b5 bl 0x0 <.alias_foo> +; DIS64-NEXT: 4c: 4b ff ff b5 bl 0x0 <.foo> ; DIS64-NEXT: 000000000000004c: R_RBR (idx: 7) .foo[PR] ; DIS64-NEXT: 50: 60 00 00 00 nop ; DIS64-NEXT: 54: 48 00 00 6d bl 0xc0 <.static_overalign_foo> @@ -221,7 +221,7 @@ entry: ; DIS64-NEXT: 5c: 4b ff ff a5 bl 0x0 <.alias_foo> ; DIS64-NEXT: 000000000000005c: R_RBR (idx: 9) .alias_foo ; DIS64-NEXT: 60: 60 00 00 00 nop -; DIS64-NEXT: 64: 4b ff ff 9d bl 0x0 <.alias_foo> +; DIS64-NEXT: 64: 4b ff ff 9d bl 0x0 <.extern_foo> ; DIS64-NEXT: 0000000000000064: R_RBR (idx: 1) 
.extern_foo[PR] ; DIS64-NEXT: 68: 60 00 00 00 nop ; DIS64-NEXT: 6c: 4b ff ff b5 bl 0x20 <.hidden_foo> diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll index 97a5fbcf78f5d..5ac6a7af0db26 100644 --- a/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-reloc.ll @@ -456,7 +456,7 @@ declare i32 @bar(i32) ; SYM-NEXT: ] -; DIS: {{.*}}aix-xcoff-reloc.ll.tmp.o: file format aixcoff-rs6000 +; DIS: : file format aixcoff-rs6000 ; DIS: Disassembly of section .text: ; DIS: 00000000 <.foo>: ; DIS-NEXT: 0: 7c 08 02 a6 mflr 0 @@ -495,7 +495,7 @@ declare i32 @bar(i32) ; DIS: 00000084 <globalB>: ; DIS-NEXT: 84: 00 00 00 44 <unknown> -; DIS_REL: {{.*}}aix-xcoff-reloc.ll.tmp.o: file format aixcoff-rs6000 +; DIS_REL: : file format aixcoff-rs6000 ; DIS_REL: RELOCATION RECORDS FOR [.text]: ; DIS_REL-NEXT: OFFSET TYPE VALUE ; DIS_REL-NEXT: 00000010 R_RBR .bar @@ -515,7 +515,7 @@ declare i32 @bar(i32) ; DIS64-NEXT: 4: f8 21 ff 91 stdu 1, -112(1) ; DIS64-NEXT: 8: 38 60 00 01 li 3, 1 ; DIS64-NEXT: c: f8 01 00 80 std 0, 128(1) -; DIS64-NEXT: 10: 4b ff ff f1 bl 0x0 <.foo> +; DIS64-NEXT: 10: 4b ff ff f1 bl 0x0 <.bar> ; DIS64-NEXT: 14: 60 00 00 00 nop ; DIS64-NEXT: 18: e8 82 00 00 ld 4, 0(2) ; DIS64-NEXT: 1c: e8 a2 00 08 ld 5, 8(2) diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll index adedb6b7a5abf..2b4d6806292ce 100644 --- a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands.ll @@ -21,7 +21,7 @@ ; CHECK-NEXT: 68: cmplwi 3, 11 ; CHECK-NEXT: 6c: bt 0, 0x60 <L2> ; CHECK-NEXT: 70: mr 31, 3 -; CHECK-NEXT: 74: bl 0x0 <.internal> +; CHECK-NEXT: 74: bl 0x0 <.extern> ; CHECK-NEXT: 78: nop ; CHECK-NEXT: 7c: mr 3, 31 ; CHECK-NEXT: 80: b 0x60 <L2> diff --git a/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands2.ll b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands2.ll new file mode 100644 index 0000000000000..a9cee924845f8 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/XCOFF/disassemble-symbolize-operands2.ll @@ -0,0 +1,65 @@ +; RUN: llc -mtriple=powerpc-ibm-aix-xcoff %s -filetype=obj -o %t +; RUN: llvm-objdump %t -r -d --symbolize-operands --no-show-raw-insn \ +; RUN: | FileCheck %s + +; CHECK-LABEL: <.a>: +;; No <L0> should appear +; CHECK-NEXT: 0: mflr 0 +; CHECK-NEXT: 4: stwu 1, -64(1) +; CHECK-NEXT: 8: lwz 3, 0(2) +; CHECK-NEXT:0000000a: R_TOC var +; CHECK-NEXT: c: stw 0, 72(1) +; CHECK-NEXT: 10: lwz 3, 0(3) +; CHECK-NEXT: 14: bl 0x4c <.b> +; CHECK-NEXT: 18: nop +; CHECK-NEXT: 1c: li 3, 1 +; CHECK-NEXT: 20: bl 0x0 <.c> +; CHECK-NEXT:00000020: R_RBR .c + +; CHECK-LABEL: <.b>: +; CHECK-NEXT: 4c: mflr 0 +; CHECK-NEXT: 50: stwu 1, -64(1) +; CHECK-NEXT: 54: cmplwi 3, 1 +; CHECK-NEXT: 58: stw 0, 72(1) +; CHECK-NEXT: 5c: stw 3, 60(1) +; CHECK-NEXT: 60: bf 2, 0x6c <L0> +; CHECK-NEXT: 64: bl 0x0 <.a> +; CHECK-NEXT: 68: nop +; CHECK-NEXT:<L0>: +; CHECK-NEXT: 6c: li 3, 2 +; CHECK-NEXT: 70: bl 0x0 <.c> +; CHECK-NEXT:00000070: R_RBR .c + +target triple = "powerpc-ibm-aix7.2.0.0" + +@var = external global i32, align 4 + +; Function Attrs: noinline nounwind optnone +define i32 @a() { +entry: + %0 = load i32, ptr @var, align 4 + %call = call i32 @b(i32 noundef %0) + %call1 = call i32 @c(i32 noundef 1) + ret i32 %call1 +} + +; Function Attrs: noinline nounwind optnone +define i32 @b(i32 noundef %x) { +entry: + %x.addr = alloca 
i32, align 4 + store i32 %x, ptr %x.addr, align 4 + %0 = load i32, ptr %x.addr, align 4 + %cmp = icmp eq i32 %0, 1 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call = call i32 @a() + br label %if.end + +if.end: ; preds = %if.then, %entry + %call1 = call i32 @c(i32 noundef 2) + ret i32 %call1 +} + +declare i32 @c(i32 noundef) + diff --git a/llvm/tools/llvm-objdump/XCOFFDump.cpp b/llvm/tools/llvm-objdump/XCOFFDump.cpp index 0f6147924f8a1..d9c00c0962098 100644 --- a/llvm/tools/llvm-objdump/XCOFFDump.cpp +++ b/llvm/tools/llvm-objdump/XCOFFDump.cpp @@ -43,6 +43,7 @@ objdump::createXCOFFDumper(const object::XCOFFObjectFile &Obj) { Error objdump::getXCOFFRelocationValueString(const XCOFFObjectFile &Obj, const RelocationRef &Rel, + bool SymbolDescription, SmallVectorImpl<char> &Result) { symbol_iterator SymI = Rel.getSymbol(); if (SymI == Obj.symbol_end()) diff --git a/llvm/tools/llvm-objdump/XCOFFDump.h b/llvm/tools/llvm-objdump/XCOFFDump.h index cf5b19f910ea8..0ba6ba4cdaaad 100644 --- a/llvm/tools/llvm-objdump/XCOFFDump.h +++ b/llvm/tools/llvm-objdump/XCOFFDump.h @@ -33,6 +33,7 @@ std::string getXCOFFSymbolDescription(const SymbolInfoTy &SymbolInfo, Error getXCOFFRelocationValueString(const object::XCOFFObjectFile &Obj, const object::RelocationRef &RelRef, + bool SymbolDescription, llvm::SmallVectorImpl<char> &Result); void dumpTracebackTable(ArrayRef<uint8_t> Bytes, uint64_t Address, diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 463d73e73ef82..7467a6062b5a8 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -424,6 +424,7 @@ bool objdump::isRelocAddressLess(RelocationRef A, RelocationRef B) { } static Error getRelocationValueString(const RelocationRef &Rel, + bool SymbolDescription, SmallVectorImpl<char> &Result) { const ObjectFile *Obj = Rel.getObject(); if (auto *ELF = dyn_cast<ELFObjectFileBase>(Obj)) @@ -435,7 +436,8 @@ static Error getRelocationValueString(const RelocationRef &Rel, if (auto *MachO = dyn_cast<MachOObjectFile>(Obj)) return getMachORelocationValueString(MachO, Rel, Result); if (auto *XCOFF = dyn_cast<XCOFFObjectFile>(Obj)) - return getXCOFFRelocationValueString(*XCOFF, Rel, Result); + return getXCOFFRelocationValueString(*XCOFF, Rel, SymbolDescription, + Result); llvm_unreachable("unknown object file format"); } @@ -527,7 +529,7 @@ static void printRelocation(formatted_raw_ostream &OS, StringRef FileName, SmallString<16> Name; SmallString<32> Val; Rel.getTypeName(Name); - if (Error E = getRelocationValueString(Rel, Val)) + if (Error E = getRelocationValueString(Rel, SymbolDescription, Val)) reportError(std::move(E), FileName); OS << (Is64Bits || !LeadingAddr ? "\t\t" : "\t\t\t"); if (LeadingAddr) @@ -1289,7 +1291,8 @@ collectLocalBranchTargets(ArrayRef<uint8_t> Bytes, MCInstrAnalysis *MIA, uint64_t Start, uint64_t End, std::unordered_map<uint64_t, std::string> &Labels) { // So far only supports PowerPC and X86. 
- if (!STI->getTargetTriple().isPPC() && !STI->getTargetTriple().isX86()) + const bool isPPC = STI->getTargetTriple().isPPC(); + if (!isPPC && !STI->getTargetTriple().isX86()) return; if (MIA) @@ -1299,8 +1302,8 @@ collectLocalBranchTargets(ArrayRef<uint8_t> Bytes, MCInstrAnalysis *MIA, unsigned LabelCount = 0; Start += SectionAddr; End += SectionAddr; - uint64_t Index = Start; - while (Index < End) { + const bool isXCOFF = STI->getTargetTriple().isOSBinFormatXCOFF(); + for (uint64_t Index = Start; Index < End;) { // Disassemble a real instruction and record function-local branch labels. MCInst Inst; uint64_t Size; @@ -1311,18 +1314,22 @@ collectLocalBranchTargets(ArrayRef<uint8_t> Bytes, MCInstrAnalysis *MIA, Size = std::min<uint64_t>(ThisBytes.size(), DisAsm->suggestBytesToSkip(ThisBytes, Index)); - if (Disassembled && MIA) { - uint64_t Target; - bool TargetKnown = MIA->evaluateBranch(Inst, Index, Size, Target); - // On PowerPC, if the address of a branch is the same as the target, it - // means that it's a function call. Do not mark the label for this case. - if (TargetKnown && (Target >= Start && Target < End) && - !Labels.count(Target) && - !(STI->getTargetTriple().isPPC() && Target == Index)) - Labels[Target] = ("L" + Twine(LabelCount++)).str(); - MIA->updateState(Inst, Index); - } else if (!Disassembled && MIA) { - MIA->resetState(); + if (MIA) { + if (Disassembled) { + uint64_t Target; + bool TargetKnown = MIA->evaluateBranch(Inst, Index, Size, Target); + if (TargetKnown && (Target >= Start && Target < End) && + !Labels.count(Target)) { + // On PowerPC and AIX, a function call is encoded as a branch to 0. + // On other PowerPC platforms (ELF), a function call is encoded as + // a branch to self. Do not add a label for these cases. + if (!(isPPC && + ((Target == 0 && isXCOFF) || (Target == Index && !isXCOFF)))) + Labels[Target] = ("L" + Twine(LabelCount++)).str(); + } + MIA->updateState(Inst, Index); + } else + MIA->resetState(); } Index += Size; } @@ -1486,7 +1493,7 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, } std::map<SectionRef, std::vector<RelocationRef>> RelocMap; - if (InlineRelocs) + if (InlineRelocs || Obj.isXCOFF()) RelocMap = getRelocsMap(Obj); bool Is64Bits = Obj.getBytesInAddress() > 4; @@ -1985,6 +1992,8 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, DT->InstrAnalysis->resetState(); while (Index < End) { + uint64_t RelOffset; + // ARM and AArch64 ELF binaries can interleave data and text in the // same section. We rely on the markers introduced to understand what // we need to dump. If the data marker is within a function, it is @@ -2019,6 +2028,26 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, } } + auto findRel = [&]() { + while (RelCur != RelEnd) { + RelOffset = RelCur->getOffset() - RelAdjustment; + // If this relocation is hidden, skip it. + if (getHidden(*RelCur) || SectionAddr + RelOffset < StartAddress) { + ++RelCur; + continue; + } + + // Stop when RelCur's offset is past the disassembled + // instruction/data. + if (RelOffset >= Index + Size) + return false; + if (RelOffset >= Index) + return true; + ++RelCur; + } + return false; + }; + if (DumpARMELFData) { Size = dumpARMELFData(SectionAddr, Index, End, Obj, Bytes, MappingSymbols, *DT->SubtargetInfo, FOS); @@ -2029,7 +2058,7 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, uint64_t MaxOffset = End - Index; // For --reloc: print zero blocks patched by relocations, so that // relocations can be shown in the dump. 
- if (RelCur != RelEnd) + if (InlineRelocs && RelCur != RelEnd) MaxOffset = std::min(RelCur->getOffset() - RelAdjustment - Index, MaxOffset); @@ -2087,17 +2116,19 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, DT->InstPrinter->setCommentStream(llvm::nulls()); - // If disassembly has failed, avoid analysing invalid/incomplete - // instruction information. Otherwise, try to resolve the target - // address (jump target or memory operand address) and print it on the + // If disassembly succeeds, we try to resolve the target address + // (jump target or memory operand address) and print it to the // right of the instruction. + // + // Otherwise, we don't print anything else so that we avoid + // analyzing invalid or incomplete instruction information. if (Disassembled && DT->InstrAnalysis) { - // Branch targets are printed just after the instructions. llvm::raw_ostream *TargetOS = &FOS; uint64_t Target; bool PrintTarget = DT->InstrAnalysis->evaluateBranch( Inst, SectionAddr + Index, Size, Target); - if (!PrintTarget) + + if (!PrintTarget) { if (std::optional<uint64_t> MaybeTarget = DT->InstrAnalysis->evaluateMemoryOperandAddress( Inst, DT->SubtargetInfo.get(), SectionAddr + Index, @@ -2111,6 +2142,8 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, *TargetOS << "0x" << Twine::utohexstr(Target); } } + } + if (PrintTarget) { // In a relocatable object, the target's section must reside in // the same section as the call instruction or it is accessed @@ -2120,7 +2153,8 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, // In that case, locate the section(s) containing the target // address and find the symbol in one of those, if possible. // - // N.B. We don't walk the relocations in the relocatable case yet. + // N.B. Except for XCOFF, we don't walk the relocations in the + // relocatable case yet. std::vector<const SectionSymbolsTy *> TargetSectionSymbols; if (!Obj.isRelocatableObject()) { auto It = llvm::partition_point( @@ -2166,19 +2200,65 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, break; } + // Branch targets are printed just after the instructions. // Print the labels corresponding to the target if there's any. bool BBAddrMapLabelAvailable = BBAddrMapLabels.count(Target); bool LabelAvailable = AllLabels.count(Target); + if (TargetSym != nullptr) { uint64_t TargetAddress = TargetSym->Addr; uint64_t Disp = Target - TargetAddress; std::string TargetName = Demangle ? demangle(TargetSym->Name) : TargetSym->Name.str(); + bool RelFixedUp = false; + SmallString<32> Val; *TargetOS << " <"; - if (!Disp) { - // Always Print the binary symbol precisely corresponding to - // the target address. + // On XCOFF, we use relocations, even without -r, so we + // can print the correct name for an extern function call. + if (Obj.isXCOFF() && findRel()) { + // Check for possible branch relocations and + // branches to fixup code. 
+ bool BranchRelocationType = true; + XCOFF::RelocationType RelocType; + if (Obj.is64Bit()) { + const XCOFFRelocation64 *Reloc = + reinterpret_cast<XCOFFRelocation64 *>( + RelCur->getRawDataRefImpl().p); + RelFixedUp = Reloc->isFixupIndicated(); + RelocType = Reloc->Type; + } else { + const XCOFFRelocation32 *Reloc = + reinterpret_cast<XCOFFRelocation32 *>( + RelCur->getRawDataRefImpl().p); + RelFixedUp = Reloc->isFixupIndicated(); + RelocType = Reloc->Type; + } + BranchRelocationType = + RelocType == XCOFF::R_BA || RelocType == XCOFF::R_BR || + RelocType == XCOFF::R_RBA || RelocType == XCOFF::R_RBR; + + // If we have a valid relocation, try to print its + // corresponding symbol name. Multiple relocations on the + // same instruction are not handled. + // Branches to fixup code will have the RelFixedUp flag set in + // the RLD. For these instructions, we print the correct + // branch target, but print the referenced symbol as a + // comment. + if (Error E = getRelocationValueString(*RelCur, false, Val)) { + // If -r was used, this error will be printed later. + // Otherwise, we ignore the error and print what + // would have been printed without using relocations. + consumeError(std::move(E)); + *TargetOS << TargetName; + RelFixedUp = false; // Suppress comment for RLD sym name + } else if (BranchRelocationType && !RelFixedUp) + *TargetOS << Val; + else + *TargetOS << TargetName; + if (Disp) + *TargetOS << "+0x" << Twine::utohexstr(Disp); + } else if (!Disp) { *TargetOS << TargetName; } else if (BBAddrMapLabelAvailable) { *TargetOS << BBAddrMapLabels[Target].front(); @@ -2190,6 +2270,12 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, *TargetOS << TargetName << "+0x" << Twine::utohexstr(Disp); } *TargetOS << ">"; + if (RelFixedUp && !InlineRelocs) { + // We have fixup code for a relocation. We print the + // referenced symbol as a comment. + *TargetOS << "\t# " << Val; + } + } else if (BBAddrMapLabelAvailable) { *TargetOS << " <" << BBAddrMapLabels[Target].front() << ">"; } else if (LabelAvailable) { @@ -2215,36 +2301,20 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, if (BTF) printBTFRelocation(FOS, *BTF, {Index, Section.getIndex()}, LVP); - // Hexagon does this in pretty printer - if (Obj.getArch() != Triple::hexagon) { - // Print relocation for instruction and data. - while (RelCur != RelEnd) { - uint64_t Offset = RelCur->getOffset() - RelAdjustment; - // If this relocation is hidden, skip it. - if (getHidden(*RelCur)) { - ++RelCur; - continue; - } - - // Stop when RelCur's offset is past the disassembled - // instruction/data. Note that it's possible the disassembled data - // is not the complete data: we might see the relocation printed in - // the middle of the data, but this matches the binutils objdump - // output. - if (Offset >= Index + Size) - break; - + // Hexagon handles relocs in pretty printer + if (InlineRelocs && Obj.getArch() != Triple::hexagon) { + while (findRel()) { // When --adjust-vma is used, update the address printed. 
if (RelCur->getSymbol() != Obj.symbol_end()) { Expected<section_iterator> SymSI = RelCur->getSymbol()->getSection(); if (SymSI && *SymSI != Obj.section_end() && shouldAdjustVA(**SymSI)) - Offset += AdjustVMA; + RelOffset += AdjustVMA; } printRelocation(FOS, Obj.getFileName(), *RelCur, - SectionAddr + Offset, Is64Bits); + SectionAddr + RelOffset, Is64Bits); LVP.printAfterOtherLine(FOS, true); ++RelCur; } @@ -2428,7 +2498,8 @@ void Dumper::printRelocations() { if (Address < StartAddress || Address > StopAddress || getHidden(Reloc)) continue; Reloc.getTypeName(RelocName); - if (Error E = getRelocationValueString(Reloc, ValueStr)) + if (Error E = + getRelocationValueString(Reloc, SymbolDescription, ValueStr)) reportUniqueWarning(std::move(E)); outs() << format(Fmt.data(), Address) << " " From 92dc23c0e054183e8adf41aad2a2609cefc392c0 Mon Sep 17 00:00:00 2001 From: Tomas Matheson <tomas.matheson@arm.com> Date: Thu, 2 Feb 2023 13:19:05 +0000 Subject: [PATCH 068/342] [AArch64] add missing test case for v9.4-A --- clang/test/Preprocessor/aarch64-target-features.c | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index db89aa7b608ad..b3da54162da04 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -600,6 +600,7 @@ // RUN: %clang -target aarch64-none-elf -march=armv9.1-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s // RUN: %clang -target aarch64-none-elf -march=armv9.2-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s // RUN: %clang -target aarch64-none-elf -march=armv9.3-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s +// RUN: %clang -target aarch64-none-elf -march=armv9.4-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s // RUN: %clang -target aarch64-none-elf -march=armv9.5-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s // CHECK-V81-OR-LATER: __ARM_FEATURE_ATOMICS 1 // CHECK-V85-OR-LATER: __ARM_FEATURE_BTI 1 From 934b1099cbf14fa3f86a269dff957da8e5fb619f Mon Sep 17 00:00:00 2001 From: Oliver Stannard <oliver.stannard@arm.com> Date: Wed, 1 Feb 2023 18:16:07 +0000 Subject: [PATCH 069/342] [AArch64] Add FEAT_PAuthLR assembler support Add assembly/disassembly support for the new PAuthLR instructions introduced in Armv9.5-A: - AUTIASPPC/AUTIBSPPC - PACIASPPC/PACIBSPPC - PACNBIASPPC/PACNBIBSPPC - RETAASPPC/RETABSPPC - PACM Documentation for these instructions can be found here: https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/ --- llvm/lib/Target/AArch64/AArch64.td | 9 +- .../lib/Target/AArch64/AArch64InstrFormats.td | 74 +++++++++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 39 +++++ llvm/lib/Target/AArch64/AArch64SchedA64FX.td | 2 +- .../Target/AArch64/AArch64SchedNeoverseN2.td | 2 +- .../AArch64/AsmParser/AArch64AsmParser.cpp | 28 ++++ .../Disassembler/AArch64Disassembler.cpp | 18 +++ .../MCTargetDesc/AArch64AsmBackend.cpp | 14 ++ .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 4 + .../AArch64/MCTargetDesc/AArch64FixupKinds.h | 5 + .../MCTargetDesc/AArch64MCCodeEmitter.cpp | 29 ++++ .../MC/AArch64/armv9.5a-pauthlr-diagnostics.s | 57 +++++++ llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s | 12 ++ 
llvm/test/MC/AArch64/armv9.5a-pauthlr.s | 151 ++++++++++++++++++ .../Disassembler/AArch64/armv9.5a-pauthlr.txt | 78 +++++++++ 15 files changed, 518 insertions(+), 4 deletions(-) create mode 100644 llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s create mode 100644 llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s create mode 100644 llvm/test/MC/AArch64/armv9.5a-pauthlr.s create mode 100644 llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index db92a94e40e4b..97e92a57a7ff4 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -622,8 +622,13 @@ def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedO def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly", "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">; +// AArch64 2023 Architecture Extensions (v9.5-A) + def FeatureCPA : SubtargetFeature<"cpa", "HasCPA", "true", - "Enable ARMv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">; + "Enable Armv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">; + +def FeaturePAuthLR : SubtargetFeature<"pauth-lr", "HasPAuthLR", + "true", "Enable Armv9.5-A PAC enhancements (FEAT_PAuth_LR)">; //===----------------------------------------------------------------------===// // Architectures. @@ -810,7 +815,7 @@ def SMEUnsupported : AArch64Unsupported { SME2Unsupported.F); } -let F = [HasPAuth] in +let F = [HasPAuth, HasPAuthLR] in def PAUnsupported : AArch64Unsupported; include "AArch64SchedA53.td" diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 690ac0dcda621..cb63d8726744d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -2368,6 +2368,80 @@ class ClearAuth<bits<1> data, string asm> let Inst{4-0} = Rd; } +// v9.5-A FEAT_PAuth_LR + +class SignAuthFixedRegs<bits<5> opcode2, bits<6> opcode, string asm> + : I<(outs), (ins), asm, "", "", []>, + Sched<[WriteI, ReadI]> { + let Inst{31} = 0b1; // sf + let Inst{30} = 0b1; + let Inst{29} = 0b0; // S + let Inst{28-21} = 0b11010110; + let Inst{20-16} = opcode2; + let Inst{15-10} = opcode; + let Inst{9-5} = 0b11111; // Rn + let Inst{4-0} = 0b11110; // Rd +} + +def PAuthPCRelLabel16Operand : PCRelLabel<16> { + let Name = "PAuthPCRelLabel16"; + let PredicateMethod = "isPAuthPCRelLabel16Operand"; +} +def am_pauth_pcrel : Operand<OtherVT> { + let EncoderMethod = "getPAuthPCRelOpValue"; + let DecoderMethod = "DecodePCRelLabel16"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PAuthPCRelLabel16Operand; + let OperandType = "OPERAND_PCREL"; +} + +class SignAuthPCRel<bits<2> opc, string asm> + : I<(outs), (ins am_pauth_pcrel:$label), asm, "\t$label", "", []>, + Sched<[]> { + bits<16> label; + let Inst{31} = 0b1; // sf + let Inst{30-23} = 0b11100111; + let Inst{22-21} = opc; + let Inst{20-5} = label; // imm + let Inst{4-0} = 0b11111; // Rd +} + +class SignAuthOneReg<bits<5> opcode2, bits<6> opcode, string asm> + : I<(outs), (ins GPR64:$Rn), asm, "\t$Rn", "", []>, + Sched<[]> { + bits<5> Rn; + let Inst{31} = 0b1; // sf + let Inst{30} = 0b1; + let Inst{29} = 0b0; // S + let Inst{28-21} = 0b11010110; + let Inst{20-16} = opcode2; + let Inst{15-10} = opcode; + let Inst{9-5} = Rn; + let Inst{4-0} = 0b11110; // Rd +} + +class SignAuthReturnPCRel<bits<3> opc, bits<5> op2, string asm> + : I<(outs), (ins am_pauth_pcrel:$label), asm, 
"\t$label", "", []>, + Sched<[WriteAtomic]> { + bits<16> label; + let Inst{31-24} = 0b01010101; + let Inst{23-21} = opc; + let Inst{20-5} = label; // imm16 + let Inst{4-0} = op2; +} + +class SignAuthReturnReg<bits<6> op3, string asm> + : I<(outs), (ins GPR64common:$Rm), asm, "\t$Rm", "", []>, + Sched<[WriteAtomic]> { + bits<5> Rm; + let Inst{31-25} = 0b1101011; + let Inst{24-21} = 0b0010; // opc + let Inst{20-16} = 0b11111; // op2 + let Inst{15-10} = op3; + let Inst{9-5} = 0b11111; // Rn + let Inst{4-0} = Rm; // op4 (Rm) +} + // Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions class BaseFlagManipulation<bit sf, bit sz, dag iops, string asm, string ops> : I<(outs), iops, asm, ops, "", []>, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 4ccac40f99a0a..977729bb082b7 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -61,6 +61,9 @@ def HasLOR : Predicate<"Subtarget->hasLOR()">, def HasPAuth : Predicate<"Subtarget->hasPAuth()">, AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">; +def HasPAuthLR : Predicate<"Subtarget->hasPAuthLR()">, + AssemblerPredicateWithAll<(all_of FeaturePAuthLR), "pauth-lr">; + def HasJS : Predicate<"Subtarget->hasJS()">, AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">; @@ -1646,6 +1649,42 @@ let Predicates = [HasPAuth] in { } +// v9.5-A pointer authentication extensions + +// Always accept "pacm" as an alias for "hint #39", but don't emit it when +// disassembling if we don't have the pauth-lr feature. +let CRm = 0b0100 in { + def PACM : SystemNoOperands<0b111, "hint\t#39">; +} +def : InstAlias<"pacm", (PACM), 0>; + +let Predicates = [HasPAuthLR] in { + let Defs = [LR], Uses = [LR, SP] in { + // opcode2, opcode, asm + def PACIASPPC : SignAuthFixedRegs<0b00001, 0b101000, "paciasppc">; + def PACIBSPPC : SignAuthFixedRegs<0b00001, 0b101001, "pacibsppc">; + def PACNBIASPPC : SignAuthFixedRegs<0b00001, 0b100000, "pacnbiasppc">; + def PACNBIBSPPC : SignAuthFixedRegs<0b00001, 0b100001, "pacnbibsppc">; + // opc, asm + def AUTIASPPCi : SignAuthPCRel<0b00, "autiasppc">; + def AUTIBSPPCi : SignAuthPCRel<0b01, "autibsppc">; + // opcode2, opcode, asm + def AUTIASPPCr : SignAuthOneReg<0b00001, 0b100100, "autiasppc">; + def AUTIBSPPCr : SignAuthOneReg<0b00001, 0b100101, "autibsppc">; + } + + let Uses = [LR, SP], isReturn = 1, isTerminator = 1, isBarrier = 1 in { + // opc, op2, asm + def RETAASPPCi : SignAuthReturnPCRel<0b000, 0b11111, "retaasppc">; + def RETABSPPCi : SignAuthReturnPCRel<0b001, 0b11111, "retabsppc">; + // op3, asm + def RETAASPPCr : SignAuthReturnReg<0b000010, "retaasppc">; + def RETABSPPCr : SignAuthReturnReg<0b000011, "retabsppc">; + } + def : InstAlias<"pacm", (PACM), 1>; +} + + // v8.3a floating point conversion for javascript let Predicates = [HasJS, HasFPARMv8], Defs = [NZCV] in def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td index 813b4a3affcfd..7edce4b61605d 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td @@ -22,7 +22,7 @@ def A64FXModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F, SVEUnsupported.F, [HasMTE, HasMatMulInt8, HasBF16, - HasPAuth, HasCPA]); + HasPAuth, HasPAuthLR, HasCPA]); let FullInstRWOverlapCheck = 0; } diff --git 
a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td index 53cf725f0e235..a6fab5e6245f8 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -19,7 +19,7 @@ def NeoverseN2Model : SchedMachineModel { let CompleteModel = 1; list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F, - [HasSVE2p1, HasCPA]); + [HasSVE2p1, HasPAuthLR, HasCPA]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 74afa4183e67e..38a92cb096029 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1696,6 +1696,21 @@ class AArch64Operand : public MCParsedAsmOperand { return DiagnosticPredicateTy::Match; } + bool isPAuthPCRelLabel16Operand() const { + // PAuth PCRel16 operands are similar to regular branch targets, but only + // negative values are allowed for concrete immediates as signing instr + // should be in a lower address. + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0b11) + return false; + return (Val <= 0) && (Val > -(1 << 18)); + } + void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. Null MCExpr = 0. if (!Expr) @@ -1997,6 +2012,19 @@ class AArch64Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2)); } + void addPAuthPCRelLabel16Operands(MCInst &Inst, unsigned N) const { + // PC-relative operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2)); + } + void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const { // Branch operands don't encode the low bits, so shift them off // here. If it's a label, however, just put it on directly as there's diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index cf2d3879292d1..c8cebaa5995e0 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -165,6 +165,9 @@ static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, uint64_t Address, const MCDisassembler *Decoder); +static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, uint64_t Address, const MCDisassembler *Decoder); @@ -887,6 +890,21 @@ static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, return Success; } +static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm, + uint64_t Addr, + const MCDisassembler *Decoder) { + // Immediate is encoded as the top 16-bits of an unsigned 18-bit negative + // PC-relative offset. 
+ int64_t ImmVal = Imm; + if (ImmVal < 0 || ImmVal > (1 << 16)) + return Fail; + ImmVal = -ImmVal; + if (!Decoder->tryAddingSymbolicOperand(Inst, (ImmVal << 2), Addr, + /*IsBranch=*/false, 0, 0, 4)) + Inst.addOperand(MCOperand::createImm(ImmVal)); + return Success; +} + static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, uint64_t Addr, const MCDisassembler *Decoder) { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index a6900b8963bb3..30ef3680ae79c 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -67,6 +67,7 @@ class AArch64AsmBackend : public MCAsmBackend { {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal}, {"fixup_aarch64_movw", 5, 16, 0}, {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch16", 5, 16, PCRelFlagVal}, {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal}, {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal}, {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}}; @@ -121,6 +122,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case AArch64::fixup_aarch64_movw: case AArch64::fixup_aarch64_pcrel_branch14: + case AArch64::fixup_aarch64_pcrel_branch16: case AArch64::fixup_aarch64_add_imm12: case AArch64::fixup_aarch64_ldst_imm12_scale1: case AArch64::fixup_aarch64_ldst_imm12_scale2: @@ -314,6 +316,17 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, if (Value & 0x3) Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3fff; + case AArch64::fixup_aarch64_pcrel_branch16: + // Unsigned PC-relative offset, so invert the negative immediate. + SignedValue = -SignedValue; + Value = static_cast<uint64_t>(SignedValue); + // Check valid 18-bit unsigned range. + if (SignedValue < 0 || SignedValue > ((1 << 18) - 1)) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); + // Low two bits are not encoded (4-byte alignment assumed). 
+ if (Value & 0b11) + Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); + return (Value >> 2) & 0xffff; case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: if (TheTriple.isOSBinFormatCOFF() && !IsResolved && SignedValue != 0) { @@ -380,6 +393,7 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con case AArch64::fixup_aarch64_movw: case AArch64::fixup_aarch64_pcrel_branch14: + case AArch64::fixup_aarch64_pcrel_branch16: case AArch64::fixup_aarch64_add_imm12: case AArch64::fixup_aarch64_ldst_imm12_scale1: case AArch64::fixup_aarch64_ldst_imm12_scale2: diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 9de40661298cc..496ab18e9b195 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -186,6 +186,10 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(LD_PREL_LO19); case AArch64::fixup_aarch64_pcrel_branch14: return R_CLS(TSTBR14); + case AArch64::fixup_aarch64_pcrel_branch16: + Ctx.reportError(Fixup.getLoc(), + "relocation of PAC/AUT instructions is not supported"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_pcrel_branch19: return R_CLS(CONDBR19); default: diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h index 767dd88055201..fdee2d5ad2bf3 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -43,6 +43,11 @@ enum Fixups { // The high 14 bits of a 21-bit pc-relative immediate. fixup_aarch64_pcrel_branch14, + // The high 16 bits of a 18-bit unsigned PC-relative immediate. Used by + // pointer authentication, only within a function, so no relocation can be + // generated. + fixup_aarch64_pcrel_branch16, + // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as // fixup_aarch64_pcrel_adrhi, except this is use by b.cc and generates // relocations directly when necessary. diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index dbc4323a860f5..c3e12b6d8024e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -88,6 +88,12 @@ class AArch64MCCodeEmitter : public MCCodeEmitter { SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + /// getPAuthPCRelOpValue - Return the encoded value for a pointer + /// authentication pc-relative operand. + uint32_t getPAuthPCRelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + /// getLoadLiteralOpValue - Return the encoded value for a load-literal /// pc-relative address. uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, @@ -327,6 +333,29 @@ uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue( return 0; } +/// getPAuthPCRelOpValue - Return the encoded value for a pointer +/// authentication pc-relative operand. 
+uint32_t +AArch64MCCodeEmitter::getPAuthPCRelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, invert sign as it's a negative value + // that should be encoded as unsigned + if (MO.isImm()) + return -(MO.getImm()); + assert(MO.isExpr() && "Unexpected target type!"); + + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch16); + Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; +} + /// getLoadLiteralOpValue - Return the encoded value for a load-literal /// pc-relative address. uint32_t diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s new file mode 100644 index 0000000000000..d06183be9da3e --- /dev/null +++ b/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s @@ -0,0 +1,57 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr 2>&1 < %s | FileCheck %s + + autiasppc #2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: autiasppc #2 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + autiasppc #1<<17 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: autiasppc #1<<17 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + autiasppc #-2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: autiasppc #-2 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + autiasppc w0 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: autiasppc w0 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + autiasppc sp +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: autiasppc sp +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retabsppc #2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retabsppc #2 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retabsppc #(1<<17) +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retabsppc #(1<<17) +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retabsppc #-2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retabsppc #-2 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retaasppc w0 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retaasppc w0 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retaasppc sp +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retaasppc sp +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retaasppc xzr +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retaasppc xzr +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s new file mode 100644 index 0000000000000..c10142a199766 --- /dev/null +++ b/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s @@ -0,0 +1,12 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr -filetype=obj -o /dev/null 2>&1 < %s | FileCheck %s + + autiasppc undef_label +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: relocation of PAC/AUT instructions is not supported +// CHECK-NEXT: 
autiasppc undef_label +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + autibsppc undef_label +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: relocation of PAC/AUT instructions is not supported +// CHECK-NEXT: autibsppc undef_label +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr.s new file mode 100644 index 0000000000000..24e9c44984683 --- /dev/null +++ b/llvm/test/MC/AArch64/armv9.5a-pauthlr.s @@ -0,0 +1,151 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+pauth-lr < %s \ +// RUN: | llvm-objdump -d --mattr=+pauth-lr - | FileCheck %s --check-prefix=CHECK-DISASS +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+pauth-lr < %s \ +// RUN: | llvm-objdump -d --mattr=-pauth-lr - | FileCheck %s --check-prefix=CHECK-UNKNOWN + +// Label at address 4, so we can test that the address shows up in the +// disassembly. + nop +label1: + + paciasppc +// CHECK-INST: paciasppc +// CHECK-DISASS: paciasppc +// CHECK-ENCODING: [0xfe,0xa3,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac1a3fe <unknown> + + pacibsppc +// CHECK-INST: pacibsppc +// CHECK-DISASS: pacibsppc +// CHECK-ENCODING: [0xfe,0xa7,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac1a7fe <unknown> + + pacnbiasppc +// CHECK-INST: pacnbiasppc +// CHECK-DISASS: pacnbiasppc +// CHECK-ENCODING: [0xfe,0x83,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac183fe <unknown> + + pacnbibsppc +// CHECK-INST: pacnbibsppc +// CHECK-DISASS: pacnbibsppc +// CHECK-ENCODING: [0xfe,0x87,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac187fe <unknown> + + autiasppc label1 +// CHECK-INST: autiasppc label1 +// CHECK-DISASS: autiasppc 0x4 <label1> +// CHECK-ENCODING: [0bAAA11111,A,0b100AAAAA,0xf3] +// CHECK-ENCODING: fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: f380009f <unknown> + + autibsppc label1 +// CHECK-INST: autibsppc label1 +// CHECK-DISASS: autibsppc 0x4 <label1> +// CHECK-ENCODING: [0bAAA11111,A,0b101AAAAA,0xf3] +// CHECK-ENCODING: fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: f3a000bf <unknown> + + autibsppc #0 +// CHECK-INST: autibsppc #0 +// CHECK-DISASS: autibsppc 0x1c <label1+0x18> +// CHECK-ENCODING: [0x1f,0x00,0xa0,0xf3] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: f3a0001f <unknown> + + autibsppc #-(1<<18)+4 +// CHECK-INST: autibsppc #-262140 +// CHECK-DISASS: autibsppc 0xfffffffffffc0024 <label1+0xfffffffffffc0020> +// CHECK-ENCODING: [0xff,0xff,0xbf,0xf3] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: f3bfffff <unknown> + + autiasppc x0 +// CHECK-INST: autiasppc x0 +// CHECK-DISASS: autiasppc x0 +// CHECK-ENCODING: [0x1e,0x90,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac1901e <unknown> + + autibsppc x1 +// CHECK-INST: autibsppc x1 +// CHECK-DISASS: autibsppc x1 +// CHECK-ENCODING: [0x3e,0x94,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac1943e <unknown> + + autiasppc xzr +// CHECK-INST: autiasppc 
xzr +// CHECK-DISASS: autiasppc xzr +// CHECK-ENCODING: [0xfe,0x93,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac193fe <unknown> + + autibsppc xzr +// CHECK-INST: autibsppc xzr +// CHECK-DISASS: autibsppc xzr +// CHECK-ENCODING: [0xfe,0x97,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac197fe <unknown> + + + retaasppc label1 +// CHECK-INST: retaasppc label1 +// CHECK-DISASS: retaasppc 0x4 <label1> +// CHECK-ENCODING: [0bAAA11111,A,0b000AAAAA,0x55] +// CHECK-ENCODING: // fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: 5500019f <unknown> + + retabsppc label1 +// CHECK-INST: retabsppc label1 +// CHECK-DISASS: retabsppc 0x4 <label1> +// CHECK-ENCODING: [0bAAA11111,A,0b001AAAAA,0x55] +// CHECK-ENCODING: // fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: 552001bf <unknown> + + retaasppc #0 +// CHECK-INST: retaasppc #0 +// CHECK-DISASS: retaasppc 0x3c <label1+0x38> +// CHECK-ENCODING: [0x1f,0x00,0x00,0x55] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: 5500001f <unknown> + + retaasppc #-(1<<18)+4 +// CHECK-INST: retaasppc #-262140 +// CHECK-DISASS: retaasppc 0xfffffffffffc0044 <label1+0xfffffffffffc0040> +// CHECK-ENCODING: [0xff,0xff,0x1f,0x55] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: 551fffff <unknown> + + retaasppc x2 +// CHECK-INST: retaasppc x2 +// CHECK-DISASS: retaasppc x2 +// CHECK-ENCODING: [0xe2,0x0b,0x5f,0xd6] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: d65f0be2 <unknown> + + retabsppc x3 +// CHECK-INST: retabsppc x3 +// CHECK-DISASS: retabsppc x3 +// CHECK-ENCODING: [0xe3,0x0f,0x5f,0xd6] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: d65f0fe3 <unknown> + + pacm +// CHECK-INST: pacm +// CHECK-DISASS: pacm +// CHECK-ENCODING: [0xff,0x24,0x03,0xd5] +// CHECK-ERROR-NOT: instruction requires: +// CHECK-UNKNOWN: d50324ff hint #39 diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt new file mode 100644 index 0000000000000..caf1fde2c2b7c --- /dev/null +++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt @@ -0,0 +1,78 @@ +# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+pauth-lr < %s | FileCheck %s +# RUN: not llvm-mc -triple aarch64 -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NO-PAUTHLR + +[0xfe,0xa3,0xc1,0xda] +# CHECK: paciasppc +# NO-PAUTHLR: invalid instruction encoding + +[0xfe,0xa7,0xc1,0xda] +# CHECK: pacibsppc +# NO-PAUTHLR: invalid instruction encoding + +[0xfe,0x83,0xc1,0xda] +# CHECK: pacnbiasppc +# NO-PAUTHLR: invalid instruction encoding + +[0xfe,0x87,0xc1,0xda] +# CHECK: pacnbibsppc +# NO-PAUTHLR: invalid instruction encoding + +[0x9f,0x00,0x80,0xf3] +# CHECK: autiasppc #-16 +# NO-PAUTHLR: invalid instruction encoding + +[0xbf,0x00,0xa0,0xf3] +# CHECK: autibsppc #-20 +# NO-PAUTHLR: invalid instruction encoding + +[0x1f,0x00,0xa0,0xf3] +# CHECK: autibsppc #0 +# NO-PAUTHLR: invalid instruction encoding + +[0xff,0xff,0xbf,0xf3] +# CHECK: autibsppc #-262140 +# NO-PAUTHLR: invalid instruction encoding + +[0x1e,0x90,0xc1,0xda] +# CHECK: autiasppc x0 +# NO-PAUTHLR: invalid instruction encoding + +[0x3e,0x94,0xc1,0xda] +# CHECK: autibsppc x1 +# NO-PAUTHLR: invalid instruction encoding + +[0xfe,0x93,0xc1,0xda] +# CHECK: autiasppc xzr +# NO-PAUTHLR: invalid 
instruction encoding + +[0xfe,0x97,0xc1,0xda] +# CHECK: autibsppc xzr +# NO-PAUTHLR: invalid instruction encoding + +[0xbf,0x01,0x00,0x55] +# CHECK: retaasppc #-52 +# NO-PAUTHLR: invalid instruction encoding + +[0xdf,0x01,0x20,0x55] +# CHECK: retabsppc #-56 +# NO-PAUTHLR: invalid instruction encoding + +[0x1f,0x00,0x00,0x55] +# CHECK: retaasppc #0 +# NO-PAUTHLR: invalid instruction encoding + +[0xff,0xff,0x1f,0x55] +# CHECK: retaasppc #-262140 +# NO-PAUTHLR: invalid instruction encoding + +[0xe2,0x0b,0x5f,0xd6] +# CHECK: retaasppc x2 +# NO-PAUTHLR: invalid instruction encoding + +[0xe3,0x0f,0x5f,0xd6] +# CHECK: retabsppc x3 +# NO-PAUTHLR: invalid instruction encoding + +[0xff,0x24,0x03,0xd5] +# CHECK: pacm +# NO-PAUTHLR: hint #39 From 5992ce90b8c0fac06436c3c86621fbf6d5398ee5 Mon Sep 17 00:00:00 2001 From: Tomas Matheson <tomas.matheson@arm.com> Date: Fri, 16 Jun 2023 11:10:37 +0100 Subject: [PATCH 070/342] [AArch64] Codegen support for FEAT_PAuthLR - Adds a new +pc option to -mbranch-protection that will enable the use of PC as a diversifier in PAC branch protection code. - When +pauth-lr is enabled (-march=armv9.5a+pauth-lr) in combination with -mbranch-protection=pac-ret+pc, the new 9.5-a instructions (pacibsppc, retaasppc, etc) are used. Documentation for the relevant instructions can be found here: https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/ Co-authored-by: Lucas Prates <lucas.prates@arm.com> --- clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Basic/TargetInfo.h | 1 + clang/include/clang/Driver/Options.td | 2 + clang/lib/Basic/Targets/AArch64.cpp | 1 + clang/lib/Basic/Targets/ARM.cpp | 1 + clang/lib/CodeGen/CodeGenModule.cpp | 3 + clang/lib/CodeGen/Targets/AArch64.cpp | 2 + clang/lib/Driver/ToolChains/Clang.cpp | 7 +- .../CodeGen/aarch64-branch-protection-attr.c | 28 + clang/test/Driver/aarch64-pauth-lr.c | 23 + clang/test/Driver/aarch64-v95a.c | 7 + .../llvm/TargetParser/AArch64TargetParser.h | 2 + .../llvm/TargetParser/ARMTargetParserCommon.h | 1 + llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 11 + .../AArch64/AArch64MachineFunctionInfo.cpp | 28 +- .../AArch64/AArch64MachineFunctionInfo.h | 18 + .../lib/Target/AArch64/AArch64PointerAuth.cpp | 86 ++- .../TargetParser/ARMTargetParserCommon.cpp | 6 +- .../AArch64/sign-return-address-pauth-lr.ll | 542 ++++++++++++++++++ .../CodeGen/AArch64/sign-return-address.ll | 3 + .../TargetParser/TargetParserTest.cpp | 4 +- 21 files changed, 752 insertions(+), 25 deletions(-) create mode 100644 clang/test/Driver/aarch64-pauth-lr.c create mode 100644 llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 152d9f65f86db..21abc346cf17a 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -456,6 +456,7 @@ ENUM_LANGOPT(SignReturnAddressScope, SignReturnAddressScopeKind, 2, SignReturnAd ENUM_LANGOPT(SignReturnAddressKey, SignReturnAddressKeyKind, 1, SignReturnAddressKeyKind::AKey, "Key used for return address signing") LANGOPT(BranchTargetEnforcement, 1, 0, "Branch-target enforcement enabled") +LANGOPT(BranchProtectionPAuthLR, 1, 0, "Use PC as a diversifier using PAuthLR NOP instructions.") LANGOPT(SpeculativeLoadHardening, 1, 0, "Speculative load hardening enabled") diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index aa0f5023104a1..ac3c324c6c29c 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ 
b/clang/include/clang/Basic/TargetInfo.h @@ -1372,6 +1372,7 @@ class TargetInfo : public TransferrableTargetInfo, LangOptions::SignReturnAddressKeyKind SignKey = LangOptions::SignReturnAddressKeyKind::AKey; bool BranchTargetEnforcement = false; + bool BranchProtectionPAuthLR = false; }; /// Determine if the Architecture in this TargetInfo supports branch diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 1b02087425b75..965d402af2d7b 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6999,6 +6999,8 @@ def msign_return_address_key_EQ : Joined<["-"], "msign-return-address-key=">, Values<"a_key,b_key">; def mbranch_target_enforce : Flag<["-"], "mbranch-target-enforce">, MarshallingInfoFlag<LangOpts<"BranchTargetEnforcement">>; +def mbranch_protection_pauth_lr : Flag<["-"], "mbranch-protection-pauth-lr">, + MarshallingInfoFlag<LangOpts<"BranchProtectionPAuthLR">>; def fno_dllexport_inlines : Flag<["-"], "fno-dllexport-inlines">, MarshallingInfoNegativeFlag<LangOpts<"DllExportInlines">>; def cfguard_no_checks : Flag<["-"], "cfguard-no-checks">, diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index def16c032c869..3ee39133fcee7 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -225,6 +225,7 @@ bool AArch64TargetInfo::validateBranchProtection(StringRef Spec, StringRef, BPI.SignKey = LangOptions::SignReturnAddressKeyKind::BKey; BPI.BranchTargetEnforcement = PBP.BranchTargetEnforcement; + BPI.BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR; return true; } diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index ce7e4d4639cea..6e1842fc64e50 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -419,6 +419,7 @@ bool ARMTargetInfo::validateBranchProtection(StringRef Spec, StringRef Arch, BPI.SignKey = LangOptions::SignReturnAddressKeyKind::AKey; BPI.BranchTargetEnforcement = PBP.BranchTargetEnforcement; + BPI.BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR; return true; } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index b2e173d0d6949..d78f2594a2376 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1106,6 +1106,9 @@ void CodeGenModule::Release() { if (LangOpts.BranchTargetEnforcement) getModule().addModuleFlag(llvm::Module::Min, "branch-target-enforcement", 1); + if (LangOpts.BranchProtectionPAuthLR) + getModule().addModuleFlag(llvm::Module::Min, "branch-protection-pauth-lr", + 1); if (LangOpts.hasSignReturnAddress()) getModule().addModuleFlag(llvm::Module::Min, "sign-return-address", 1); if (LangOpts.isSignReturnAddressScopeAll()) diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index be5145daa00b7..7102d190fe008 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -136,6 +136,8 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo { Fn->addFnAttr("branch-target-enforcement", BPI.BranchTargetEnforcement ? "true" : "false"); + Fn->addFnAttr("branch-protection-pauth-lr", + BPI.BranchProtectionPAuthLR ? 
"true" : "false"); } bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index de9fd5eaa1e02..4783affd3220b 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1497,7 +1497,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, << Triple.getArchName(); StringRef Scope, Key; - bool IndirectBranches; + bool IndirectBranches, BranchProtectionPAuthLR; if (A->getOption().matches(options::OPT_msign_return_address_EQ)) { Scope = A->getValue(); @@ -1506,6 +1506,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, << A->getSpelling() << Scope; Key = "a_key"; IndirectBranches = false; + BranchProtectionPAuthLR = false; } else { StringRef DiagMsg; llvm::ARM::ParsedBranchProtection PBP; @@ -1517,6 +1518,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, << "b-key" << A->getAsString(Args); Scope = PBP.Scope; Key = PBP.Key; + BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR; IndirectBranches = PBP.BranchTargetEnforcement; } @@ -1525,6 +1527,9 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, if (!Scope.equals("none")) CmdArgs.push_back( Args.MakeArgString(Twine("-msign-return-address-key=") + Key)); + if (BranchProtectionPAuthLR) + CmdArgs.push_back( + Args.MakeArgString(Twine("-mbranch-protection-pauth-lr"))); if (IndirectBranches) CmdArgs.push_back("-mbranch-target-enforce"); } diff --git a/clang/test/CodeGen/aarch64-branch-protection-attr.c b/clang/test/CodeGen/aarch64-branch-protection-attr.c index 3c2714e2feda2..8ab3e17ade426 100644 --- a/clang/test/CodeGen/aarch64-branch-protection-attr.c +++ b/clang/test/CodeGen/aarch64-branch-protection-attr.c @@ -46,6 +46,24 @@ __attribute__ ((target("branch-protection=pac-ret+leaf+bti"))) void btileaf() {} // CHECK: define{{.*}} void @btileaf() #[[#BTIPACLEAF:]] + +__attribute__ ((target("branch-protection=pac-ret+pc"))) +void pauthlr() {} +// CHECK: define{{.*}} void @pauthlr() #[[#PAUTHLR:]] + +__attribute__ ((target("branch-protection=pac-ret+pc+b-key"))) +void pauthlr_bkey() {} +// CHECK: define{{.*}} void @pauthlr_bkey() #[[#PAUTHLR_BKEY:]] + +__attribute__ ((target("branch-protection=pac-ret+pc+leaf"))) +void pauthlr_leaf() {} +// CHECK: define{{.*}} void @pauthlr_leaf() #[[#PAUTHLR_LEAF:]] + +__attribute__ ((target("branch-protection=pac-ret+pc+bti"))) +void pauthlr_bti() {} +// CHECK: define{{.*}} void @pauthlr_bti() #[[#PAUTHLR_BTI:]] + + // CHECK-DAG: attributes #[[#NONE]] = { {{.*}} "branch-target-enforcement"="false" {{.*}} "sign-return-address"="none" // CHECK-DAG: attributes #[[#STD]] = { {{.*}} "branch-target-enforcement"="true" {{.*}} "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" @@ -61,3 +79,13 @@ void btileaf() {} // CHECK-DAG: attributes #[[#PACBKEYLEAF]] = { {{.*}} "branch-target-enforcement"="false" {{.*}}"sign-return-address"="all" "sign-return-address-key"="b_key" // CHECK-DAG: attributes #[[#BTIPACLEAF]] = { {{.*}}"branch-target-enforcement"="true" {{.*}} "sign-return-address"="all" "sign-return-address-key"="a_key" + + +// CHECK-DAG: attributes #[[#PAUTHLR]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="a_key" + +// CHECK-DAG: attributes #[[#PAUTHLR_BKEY]] = { {{.*}}"branch-protection-pauth-lr"="true" 
{{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="b_key" + +// CHECK-DAG: attributes #[[#PAUTHLR_LEAF]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="all" "sign-return-address-key"="a_key" + +// CHECK-DAG: attributes #[[#PAUTHLR_BTI]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="true" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="a_key" + diff --git a/clang/test/Driver/aarch64-pauth-lr.c b/clang/test/Driver/aarch64-pauth-lr.c new file mode 100644 index 0000000000000..2e1b530fc9895 --- /dev/null +++ b/clang/test/Driver/aarch64-pauth-lr.c @@ -0,0 +1,23 @@ +// Check the -cc1 flags for the various forms of -mbranch-protection=pac-ret+pc. + +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+b-key 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+b-key -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI + +// PAUTH-LR: "-msign-return-address=non-leaf" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr" +// PAUTH-LR-B-KEY: "-msign-return-address=non-leaf" "-msign-return-address-key=b_key" "-mbranch-protection-pauth-lr" +// PAUTH-LR-LEAF: "-msign-return-address=all" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr" +// PAUTH-LR-BTI: "-msign-return-address=non-leaf" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr" +// PAUTH-LR-LEAF-B-KEY-BTI: "-msign-return-address=all" "-msign-return-address-key=b_key" "-mbranch-protection-pauth-lr" "-mbranch-target-enforce" + +// NOT-PAUTH-LR: "-mbranch-target-enforce" +// NOT-PAUTH-LR-B-KEY: "-mbranch-target-enforce" +// NOT-PAUTH-LR-LEAF: "-mbranch-target-enforce" +// NOT-PAUTH-LR-BTI: "-mbranch-target-enforce" diff --git a/clang/test/Driver/aarch64-v95a.c b/clang/test/Driver/aarch64-v95a.c index 366cade86a9fb..6fac62e8b389a 100644 --- a/clang/test/Driver/aarch64-v95a.c +++ b/clang/test/Driver/aarch64-v95a.c @@ -1,3 +1,5 @@ +// ===== Base v9.5a architecture ===== + // RUN: %clang -target aarch64 -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // RUN: %clang 
-target aarch64 -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // RUN: %clang -target aarch64 -mlittle-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s @@ -5,6 +7,7 @@ // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // GENERICV95A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" + // RUN: %clang -target aarch64_be -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s // RUN: %clang -target aarch64_be -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s // RUN: %clang -target aarch64 -mbig-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s @@ -18,3 +21,7 @@ // RUN: %clang -target aarch64 -march=armv9.5a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s // RUN: %clang -target aarch64 -march=armv9.5-a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s // V95A-CPA: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" "-target-feature" "+cpa" + +// RUN: %clang -target aarch64 -march=armv9.5a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s +// RUN: %clang -target aarch64 -march=armv9.5-a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s +// V95A-PAUTHLR: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" "-target-feature" "+pauth-lr" diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index f0b35790133fb..6c7410a8b8f79 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -174,6 +174,7 @@ enum ArchExtKind : unsigned { AEK_SMEF8F32 = 70, // FEAT_SME_F8F32 AEK_SMEFA64 = 71, // FEAT_SME_FA64 AEK_CPA = 72, // FEAT_CPA + AEK_PAUTHLR = 73, // FEAT_PAuth_LR AEK_NUM_EXTENSIONS }; using ExtensionBitset = Bitset<AEK_NUM_EXTENSIONS>; @@ -297,6 +298,7 @@ inline constexpr ExtensionInfo Extensions[] = { {"sme-f8f32", AArch64::AEK_SMEF8F32, "+sme-f8f32", "-sme-f8f32", FEAT_INIT, "+sme2,+fp8", 0}, {"sme-fa64", AArch64::AEK_SMEFA64, "+sme-fa64", "-sme-fa64", FEAT_INIT, "", 0}, {"cpa", AArch64::AEK_CPA, "+cpa", "-cpa", FEAT_INIT, "", 0}, + {"pauth-lr", AArch64::AEK_PAUTHLR, "+pauth-lr", "-pauth-lr", FEAT_INIT, "", 0}, // Special cases {"none", AArch64::AEK_NONE, {}, {}, FEAT_INIT, "", ExtensionInfo::MaxFMVPriority}, }; diff --git a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h index e3d9ffc1d4db5..1e4187c6fb111 100644 --- a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h +++ b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h @@ -41,6 +41,7 @@ struct ParsedBranchProtection { StringRef Scope; StringRef Key; bool BranchTargetEnforcement; + bool BranchProtectionPAuthLR; }; bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 175f6ef49c3ba..6d85e1fb5fbf1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -8802,12 +8802,23 @@ 
AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT, // Don't outline anything used for return address signing. The outlined // function will get signed later if needed switch (MI.getOpcode()) { + case AArch64::PACM: case AArch64::PACIASP: case AArch64::PACIBSP: + case AArch64::PACIASPPC: + case AArch64::PACIBSPPC: case AArch64::AUTIASP: case AArch64::AUTIBSP: + case AArch64::AUTIASPPCi: + case AArch64::AUTIASPPCr: + case AArch64::AUTIBSPPCi: + case AArch64::AUTIBSPPCr: case AArch64::RETAA: case AArch64::RETAB: + case AArch64::RETAASPPCi: + case AArch64::RETAASPPCr: + case AArch64::RETABSPPCi: + case AArch64::RETABSPPCr: case AArch64::EMITBKEY: case AArch64::PAUTH_PROLOGUE: case AArch64::PAUTH_EPILOGUE: diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index 9da59ef2a8062..1a8c71888a852 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -93,16 +93,24 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F, // TODO: skip functions that have no instrumented allocas for optimization IsMTETagged = F.hasFnAttribute(Attribute::SanitizeMemTag); - if (!F.hasFnAttribute("branch-target-enforcement")) { - if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( - F.getParent()->getModuleFlag("branch-target-enforcement"))) - BranchTargetEnforcement = BTE->getZExtValue(); - } else { - const StringRef BTIEnable = - F.getFnAttribute("branch-target-enforcement").getValueAsString(); - assert(BTIEnable == "true" || BTIEnable == "false"); - BranchTargetEnforcement = BTIEnable == "true"; - } + // BTI/PAuthLR may be set either on the function or the module. Set Bool from + // either the function attribute or module attribute, depending on what is + // set. + // Note: the module attributed is numeric (0 or 1) but the function attribute + // is stringy ("true" or "false"). + auto TryFnThenModule = [&](StringRef AttrName, bool &Bool) { + if (F.hasFnAttribute(AttrName)) { + const StringRef V = F.getFnAttribute(AttrName).getValueAsString(); + assert(V.equals_insensitive("true") || V.equals_insensitive("false")); + Bool = V.equals_insensitive("true"); + } else if (const auto *ModVal = mdconst::extract_or_null<ConstantInt>( + F.getParent()->getModuleFlag(AttrName))) { + Bool = ModVal->getZExtValue(); + } + }; + + TryFnThenModule("branch-target-enforcement", BranchTargetEnforcement); + TryFnThenModule("branch-protection-pauth-lr", BranchProtectionPAuthLR); // The default stack probe size is 4096 if the function has no // stack-probe-size attribute. This is a safe default because it is the diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 219f83cfd32e0..cd4a18bfbc23a 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCLinkerOptimizationHint.h" +#include "llvm/MC/MCSymbol.h" #include <cassert> #include <optional> @@ -164,10 +165,21 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// SignWithBKey modifies the default PAC-RET mode to signing with the B key. 
bool SignWithBKey = false; + /// SigningInstrOffset captures the offset of the PAC-RET signing instruction + /// within the prologue, so it can be re-used for authentication in the + /// epilogue when using PC as a second salt (FEAT_PAuth_LR) + MCSymbol *SignInstrLabel = nullptr; + /// BranchTargetEnforcement enables placing BTI instructions at potential /// indirect branch destinations. bool BranchTargetEnforcement = false; + /// Indicates that SP signing should be diversified with PC as-per PAuthLR. + /// This is set by -mbranch-protection and will emit NOP instructions unless + /// the subtarget feature +pauthlr is also used (in which case non-NOP + /// instructions are emitted). + bool BranchProtectionPAuthLR = false; + /// Whether this function has an extended frame record [Ctx, FP, LR]. If so, /// bit 60 of the in-memory FP will be 1 to enable other tools to detect the /// extended record. @@ -436,10 +448,16 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) const; bool shouldSignWithBKey() const { return SignWithBKey; } + + MCSymbol *getSigningInstrLabel() const { return SignInstrLabel; } + void setSigningInstrLabel(MCSymbol *Label) { SignInstrLabel = Label; } + bool isMTETagged() const { return IsMTETagged; } bool branchTargetEnforcement() const { return BranchTargetEnforcement; } + bool branchProtectionPAuthLR() const { return BranchProtectionPAuthLR; } + void setHasSwiftAsyncContext(bool HasContext) { HasSwiftAsyncContext = HasContext; } diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp index 7576d2a899d1a..334149a6bf5cf 100644 --- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp +++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp @@ -60,11 +60,35 @@ FunctionPass *llvm::createAArch64PointerAuthPass() { char AArch64PointerAuth::ID = 0; +// Where PAuthLR support is not known at compile time, it is supported using +// PACM. PACM is in the hint space so has no effect when PAuthLR is not +// supported by the hardware, but will alter the behaviour of PACI*SP, AUTI*SP +// and RETAA/RETAB if the hardware supports PAuthLR. +static void BuildPACM(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + MachineInstr::MIFlag Flags, MCSymbol *PACSym = nullptr) { + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + auto &MFnI = *MBB.getParent()->getInfo<AArch64FunctionInfo>(); + + // ADR X16,<address_of_PACIASP> + if (PACSym) { + assert(Flags == MachineInstr::FrameDestroy); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADR)) + .addReg(AArch64::X16) + .addSym(PACSym); + } + + // Only emit PACM if -mbranch-protection has +pc and the target does not + // have feature +pauth-lr. 
+ if (MFnI.branchProtectionPAuthLR() && !Subtarget.hasPAuthLR()) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACM)).setMIFlag(Flags); +} + void AArch64PointerAuth::signLR(MachineFunction &MF, MachineBasicBlock::iterator MBBI) const { - const AArch64FunctionInfo *MFnI = MF.getInfo<AArch64FunctionInfo>(); - bool UseBKey = MFnI->shouldSignWithBKey(); - bool EmitCFI = MFnI->needsDwarfUnwindInfo(MF); + auto &MFnI = *MF.getInfo<AArch64FunctionInfo>(); + bool UseBKey = MFnI.shouldSignWithBKey(); + bool EmitCFI = MFnI.needsDwarfUnwindInfo(MF); bool NeedsWinCFI = MF.hasWinCFI(); MachineBasicBlock &MBB = *MBBI->getParent(); @@ -77,11 +101,29 @@ void AArch64PointerAuth::signLR(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } + // PAuthLR authentication instructions need to know the value of PC at the + // point of signing (PACI*). + if (MFnI.branchProtectionPAuthLR()) { + MCSymbol *PACSym = MF.getMMI().getContext().createTempSymbol(); + MFnI.setSigningInstrLabel(PACSym); + } + // No SEH opcode for this one; it doesn't materialize into an // instruction on Windows. - BuildMI(MBB, MBBI, DL, - TII->get(UseBKey ? AArch64::PACIBSP : AArch64::PACIASP)) - .setMIFlag(MachineInstr::FrameSetup); + if (MFnI.branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) { + BuildMI(MBB, MBBI, DL, + TII->get(MFnI.shouldSignWithBKey() ? AArch64::PACIBSPPC + : AArch64::PACIASPPC)) + .setMIFlag(MachineInstr::FrameSetup) + ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel()); + } else { + BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, + TII->get(MFnI.shouldSignWithBKey() ? AArch64::PACIBSP + : AArch64::PACIASP)) + .setMIFlag(MachineInstr::FrameSetup) + ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel()); + } if (EmitCFI) { unsigned CFIIndex = @@ -118,15 +160,37 @@ void AArch64PointerAuth::authenticateLR( // DW_CFA_AARCH64_negate_ra_state can't be emitted. bool TerminatorIsCombinable = TI != MBB.end() && TI->getOpcode() == AArch64::RET; + MCSymbol *PACSym = MFnI->getSigningInstrLabel(); + if (Subtarget->hasPAuth() && TerminatorIsCombinable && !NeedsWinCFI && !MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) { - unsigned CombinedRetOpcode = UseBKey ? AArch64::RETAB : AArch64::RETAA; - BuildMI(MBB, TI, DL, TII->get(CombinedRetOpcode)).copyImplicitOps(*TI); + if (MFnI->branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) { + assert(PACSym && "No PAC instruction to refer to"); + BuildMI(MBB, TI, DL, + TII->get(UseBKey ? AArch64::RETABSPPCi : AArch64::RETAASPPCi)) + .addSym(PACSym) + .copyImplicitOps(*MBBI) + .setMIFlag(MachineInstr::FrameDestroy); + } else { + BuildPACM(*Subtarget, MBB, TI, DL, MachineInstr::FrameDestroy, PACSym); + BuildMI(MBB, TI, DL, TII->get(UseBKey ? AArch64::RETAB : AArch64::RETAA)) + .copyImplicitOps(*MBBI) + .setMIFlag(MachineInstr::FrameDestroy); + } MBB.erase(TI); } else { - unsigned AutOpcode = UseBKey ? AArch64::AUTIBSP : AArch64::AUTIASP; - BuildMI(MBB, MBBI, DL, TII->get(AutOpcode)) - .setMIFlag(MachineInstr::FrameDestroy); + if (MFnI->branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) { + assert(PACSym && "No PAC instruction to refer to"); + BuildMI(MBB, MBBI, DL, + TII->get(UseBKey ? AArch64::AUTIBSPPCi : AArch64::AUTIASPPCi)) + .addSym(PACSym) + .setMIFlag(MachineInstr::FrameDestroy); + } else { + BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy, PACSym); + BuildMI(MBB, MBBI, DL, + TII->get(UseBKey ? 
AArch64::AUTIBSP : AArch64::AUTIASP)) + .setMIFlag(MachineInstr::FrameDestroy); + } if (EmitAsyncCFI) { unsigned CFIIndex = diff --git a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp index 10b80cad43472..6d3a59d532fd3 100644 --- a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp +++ b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp @@ -134,13 +134,13 @@ ARM::EndianKind ARM::parseArchEndian(StringRef Arch) { } // Parse a branch protection specification, which has the form -// standard | none | [bti,pac-ret[+b-key,+leaf]*] +// standard | none | [bti,pac-ret[+b-key,+leaf,+pc]*] // Returns true on success, with individual elements of the specification // returned in `PBP`. Returns false in error, with `Err` containing // an erroneous part of the spec. bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, StringRef &Err) { - PBP = {"none", "a_key", false}; + PBP = {"none", "a_key", false, false}; if (Spec == "none") return true; // defaults are ok @@ -166,6 +166,8 @@ bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, PBP.Scope = "all"; else if (PACOpt == "b-key") PBP.Key = "b_key"; + else if (PACOpt == "pc") + PBP.BranchProtectionPAuthLR = true; else break; } diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll new file mode 100644 index 0000000000000..a78fa853d99dc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll @@ -0,0 +1,542 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 + +; PauthLR is controlled via a combination of -mbranch-protection and +pauth-lr. +; -mbranch-protection=+pc enables branch protection. If the feature +pauth-lr +; is available (v9.5a onwards) then non-NOP instructions are used; otherwise +; NOP instructions are used. + +; There are 6 cases to cover: + +; feature \ -mbranch-protection= | none | pac-ret | pac-ret+pc +; ------------------------------------------------------------------------ +; without +pauth-lr | no codegen | old pac | NOP pauth-lr +; with +pauth-lr | no codegen | old pac | non-NOP pauth-lr + +; sign-return-address.ll tests combinations of -mbranch-protection=none/pac-ret +; and whether +pauth-lr is present or not. 
+ +; sign-return-address-pauth-lr.ll is identical, with the addition of this module +; attribute, which enables -mbranch-protection=pac-ret+pc, and therefore tests +; the remaining parameter combinations in the table: +!llvm.module.flags = !{!1} +!1 = !{i32 1, !"branch-protection-pauth-lr", i32 1} + +; RUN: llc -mtriple=aarch64 < %s | FileCheck --check-prefixes=CHECK,COMPAT %s +; RUN: llc -mtriple=aarch64 -mattr=v8.3a < %s | FileCheck --check-prefixes=CHECK,V83A %s +; RUN: llc -mtriple=aarch64 -mattr=v9a -mattr=pauth-lr < %s | FileCheck --check-prefixes=PAUTHLR %s + +define i32 @leaf(i32 %x) { +; CHECK-LABEL: leaf: +; CHECK: // %bb.0: +; CHECK-NEXT: ret +; +; PAUTHLR-LABEL: leaf: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: ret + ret i32 %x +} + +define i32 @leaf_sign_none(i32 %x) "sign-return-address"="none" { +; CHECK-LABEL: leaf_sign_none: +; CHECK: // %bb.0: +; CHECK-NEXT: ret +; +; PAUTHLR-LABEL: leaf_sign_none: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: ret + ret i32 %x +} + +define i32 @leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { +; CHECK-LABEL: leaf_sign_non_leaf: +; CHECK: // %bb.0: +; CHECK-NEXT: ret +; +; PAUTHLR-LABEL: leaf_sign_non_leaf: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: ret + ret i32 %x +} + +define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" { +; COMPAT-LABEL: leaf_sign_all: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp0: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: adr x16, .Ltmp0 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_sign_all: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp0: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: adr x16, .Ltmp0 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: leaf_sign_all: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp0: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retaasppc .Ltmp0 + ret i32 %x +} + +define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf" { +; COMPAT-LABEL: leaf_clobbers_lr: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp1: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; COMPAT-NEXT: .cfi_def_cfa_offset 16 +; COMPAT-NEXT: .cfi_offset w30, -16 +; COMPAT-NEXT: //APP +; COMPAT-NEXT: mov x30, x0 +; COMPAT-NEXT: //NO_APP +; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; COMPAT-NEXT: adr x16, .Ltmp1 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_clobbers_lr: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp1: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; V83A-NEXT: .cfi_def_cfa_offset 16 +; V83A-NEXT: .cfi_offset w30, -16 +; V83A-NEXT: //APP +; V83A-NEXT: mov x30, x0 +; V83A-NEXT: //NO_APP +; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; V83A-NEXT: adr x16, .Ltmp1 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: leaf_clobbers_lr: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp1: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 +; PAUTHLR-NEXT: .cfi_offset w30, -16 +; PAUTHLR-NEXT: //APP +; PAUTHLR-NEXT: mov x30, x0 +; PAUTHLR-NEXT: //NO_APP +; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; PAUTHLR-NEXT: retaasppc .Ltmp1 + call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1 + ret i64 %x +} + +declare i32 @foo(i32) + +define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" { +; COMPAT-LABEL: non_leaf_sign_all: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp2: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; COMPAT-NEXT: .cfi_def_cfa_offset 16 +; COMPAT-NEXT: .cfi_offset w30, -16 +; COMPAT-NEXT: bl foo +; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; COMPAT-NEXT: adr x16, .Ltmp2 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: non_leaf_sign_all: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp2: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; V83A-NEXT: .cfi_def_cfa_offset 16 +; V83A-NEXT: .cfi_offset w30, -16 +; V83A-NEXT: bl foo +; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; V83A-NEXT: adr x16, .Ltmp2 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: non_leaf_sign_all: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp2: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 +; PAUTHLR-NEXT: .cfi_offset w30, -16 +; PAUTHLR-NEXT: bl foo +; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; PAUTHLR-NEXT: retaasppc .Ltmp2 + %call = call i32 @foo(i32 %x) + ret i32 %call +} + +define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { +; COMPAT-LABEL: non_leaf_sign_non_leaf: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp3: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; COMPAT-NEXT: .cfi_def_cfa_offset 16 +; COMPAT-NEXT: .cfi_offset w30, -16 +; COMPAT-NEXT: bl foo +; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; COMPAT-NEXT: adr x16, .Ltmp3 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: non_leaf_sign_non_leaf: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp3: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; V83A-NEXT: .cfi_def_cfa_offset 16 +; V83A-NEXT: .cfi_offset w30, -16 +; V83A-NEXT: bl foo +; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; V83A-NEXT: adr x16, .Ltmp3 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: non_leaf_sign_non_leaf: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp3: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 +; PAUTHLR-NEXT: .cfi_offset w30, -16 +; PAUTHLR-NEXT: bl foo +; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; PAUTHLR-NEXT: retaasppc .Ltmp3 + %call = call i32 @foo(i32 %x) + ret i32 %call +} + +; Should not use the RETAA instruction. 
+define i32 @non_leaf_scs(i32 %x) "sign-return-address"="non-leaf" shadowcallstack "target-features"="+v8.3a,+reserve-x18" { +; CHECK-LABEL: non_leaf_scs: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [x18], #8 +; CHECK-NEXT: .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 // +; CHECK-NEXT: hint #39 +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: paciasp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl foo +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: adr x16, .Ltmp4 +; CHECK-NEXT: hint #39 +; CHECK-NEXT: autiasp +; CHECK-NEXT: ldr x30, [x18, #-8]! +; CHECK-NEXT: ret +; +; PAUTHLR-LABEL: non_leaf_scs: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: str x30, [x18], #8 +; PAUTHLR-NEXT: .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 // +; PAUTHLR-NEXT: .Ltmp4: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 +; PAUTHLR-NEXT: .cfi_offset w30, -16 +; PAUTHLR-NEXT: bl foo +; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; PAUTHLR-NEXT: autiasppc .Ltmp4 +; PAUTHLR-NEXT: ldr x30, [x18, #-8]! +; PAUTHLR-NEXT: ret + %call = call i32 @foo(i32 %x) + ret i32 %call +} + +define i32 @leaf_sign_all_v83(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" { +; CHECK-LABEL: leaf_sign_all_v83: +; CHECK: // %bb.0: +; CHECK-NEXT: hint #39 +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: paciasp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK-NEXT: adr x16, .Ltmp5 +; CHECK-NEXT: hint #39 +; CHECK-NEXT: retaa +; +; PAUTHLR-LABEL: leaf_sign_all_v83: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp5: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retaasppc .Ltmp5 + ret i32 %x +} + +declare fastcc i64 @bar(i64) + +define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" { +; COMPAT-LABEL: spill_lr_and_tail_call: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp6: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; COMPAT-NEXT: .cfi_def_cfa_offset 16 +; COMPAT-NEXT: .cfi_offset w30, -16 +; COMPAT-NEXT: //APP +; COMPAT-NEXT: mov x30, x0 +; COMPAT-NEXT: //NO_APP +; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; COMPAT-NEXT: adr x16, .Ltmp6 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: b bar +; +; V83A-LABEL: spill_lr_and_tail_call: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp6: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; V83A-NEXT: .cfi_def_cfa_offset 16 +; V83A-NEXT: .cfi_offset w30, -16 +; V83A-NEXT: //APP +; V83A-NEXT: mov x30, x0 +; V83A-NEXT: //NO_APP +; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; V83A-NEXT: adr x16, .Ltmp6 +; V83A-NEXT: hint #39 +; V83A-NEXT: autiasp +; V83A-NEXT: b bar +; +; PAUTHLR-LABEL: spill_lr_and_tail_call: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp6: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 +; PAUTHLR-NEXT: .cfi_offset w30, -16 +; PAUTHLR-NEXT: //APP +; PAUTHLR-NEXT: mov x30, x0 +; PAUTHLR-NEXT: //NO_APP +; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; PAUTHLR-NEXT: autiasppc .Ltmp6 +; PAUTHLR-NEXT: b bar + call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1 + tail call fastcc i64 @bar(i64 %x) + ret void +} + +define i32 @leaf_sign_all_a_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" { +; COMPAT-LABEL: leaf_sign_all_a_key: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp7: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: adr x16, .Ltmp7 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_sign_all_a_key: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp7: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: adr x16, .Ltmp7 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: leaf_sign_all_a_key: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp7: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retaasppc .Ltmp7 + ret i32 %x +} + +define i32 @leaf_sign_all_b_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="b_key" { +; COMPAT-LABEL: leaf_sign_all_b_key: +; COMPAT: // %bb.0: +; COMPAT-NEXT: .cfi_b_key_frame +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp8: +; COMPAT-NEXT: hint #27 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: adr x16, .Ltmp8 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #31 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_sign_all_b_key: +; V83A: // %bb.0: +; V83A-NEXT: .cfi_b_key_frame +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp8: +; V83A-NEXT: pacibsp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: adr x16, .Ltmp8 +; V83A-NEXT: hint #39 +; V83A-NEXT: retab +; +; PAUTHLR-LABEL: leaf_sign_all_b_key: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .cfi_b_key_frame +; PAUTHLR-NEXT: .Ltmp8: +; PAUTHLR-NEXT: pacibsppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retabsppc .Ltmp8 + ret i32 %x +} + +define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" { +; CHECK-LABEL: leaf_sign_all_v83_b_key: +; CHECK: // %bb.0: +; CHECK-NEXT: .cfi_b_key_frame +; CHECK-NEXT: hint #39 +; CHECK-NEXT: .Ltmp9: +; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK-NEXT: adr x16, .Ltmp9 +; CHECK-NEXT: hint #39 +; CHECK-NEXT: retab +; +; PAUTHLR-LABEL: leaf_sign_all_v83_b_key: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .cfi_b_key_frame +; PAUTHLR-NEXT: .Ltmp9: +; PAUTHLR-NEXT: pacibsppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retabsppc .Ltmp9 + ret i32 %x +} + +; Note that BTI instruction is not needed before PACIASP. 
+define i32 @leaf_sign_all_a_key_bti(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" "branch-target-enforcement"="true"{ +; COMPAT-LABEL: leaf_sign_all_a_key_bti: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #34 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp10: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: adr x16, .Ltmp10 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_sign_all_a_key_bti: +; V83A: // %bb.0: +; V83A-NEXT: hint #34 +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp10: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: adr x16, .Ltmp10 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: leaf_sign_all_a_key_bti: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: bti c +; PAUTHLR-NEXT: .Ltmp10: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retaasppc .Ltmp10 + ret i32 %x +} + +; Note that BTI instruction is not needed before PACIBSP. +define i32 @leaf_sign_all_b_key_bti(i32 %x) "sign-return-address"="all" "sign-return-address-key"="b_key" "branch-target-enforcement"="true"{ +; COMPAT-LABEL: leaf_sign_all_b_key_bti: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #34 +; COMPAT-NEXT: .cfi_b_key_frame +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp11: +; COMPAT-NEXT: hint #27 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: adr x16, .Ltmp11 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #31 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_sign_all_b_key_bti: +; V83A: // %bb.0: +; V83A-NEXT: hint #34 +; V83A-NEXT: .cfi_b_key_frame +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp11: +; V83A-NEXT: pacibsp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: adr x16, .Ltmp11 +; V83A-NEXT: hint #39 +; V83A-NEXT: retab +; +; PAUTHLR-LABEL: leaf_sign_all_b_key_bti: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: bti c +; PAUTHLR-NEXT: .cfi_b_key_frame +; PAUTHLR-NEXT: .Ltmp11: +; PAUTHLR-NEXT: pacibsppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retabsppc .Ltmp11 + ret i32 %x +} + +; Note that BTI instruction is not needed before PACIBSP. +define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" "branch-target-enforcement"="true" { +; CHECK-LABEL: leaf_sign_all_v83_b_key_bti: +; CHECK: // %bb.0: +; CHECK-NEXT: hint #34 +; CHECK-NEXT: .cfi_b_key_frame +; CHECK-NEXT: hint #39 +; CHECK-NEXT: .Ltmp12: +; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK-NEXT: adr x16, .Ltmp12 +; CHECK-NEXT: hint #39 +; CHECK-NEXT: retab +; +; PAUTHLR-LABEL: leaf_sign_all_v83_b_key_bti: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: bti c +; PAUTHLR-NEXT: .cfi_b_key_frame +; PAUTHLR-NEXT: .Ltmp12: +; PAUTHLR-NEXT: pacibsppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retabsppc .Ltmp12 + ret i32 %x +} diff --git a/llvm/test/CodeGen/AArch64/sign-return-address.ll b/llvm/test/CodeGen/AArch64/sign-return-address.ll index 5680915c7f414..1481d4beb50d6 100644 --- a/llvm/test/CodeGen/AArch64/sign-return-address.ll +++ b/llvm/test/CodeGen/AArch64/sign-return-address.ll @@ -2,6 +2,9 @@ ; RUN: llc -mtriple=aarch64 < %s | FileCheck --check-prefixes=CHECK,COMPAT %s ; RUN: llc -mtriple=aarch64 -mattr=v8.3a < %s | FileCheck --check-prefixes=CHECK,V83A %s +; v9.5-A is not expected to change codegen without -mbranch-protection=+pc, so reuse V83A. 
+; RUN: llc -mtriple=aarch64 -mattr=v9.5a < %s | FileCheck --check-prefixes=CHECK,V83A %s + define i32 @leaf(i32 %x) { ; CHECK-LABEL: leaf: ; CHECK: // %bb.0: diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 30e60ad92b68e..866176ab09836 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1812,7 +1812,8 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { AArch64::AEK_SSVE_FP8DOT4, AArch64::AEK_LUT, AArch64::AEK_SME_LUTv2, AArch64::AEK_SMEF8F16, AArch64::AEK_SMEF8F32, AArch64::AEK_SMEFA64, - AArch64::AEK_CPA}; + AArch64::AEK_CPA, AArch64::AEK_PAUTHLR, + }; std::vector<StringRef> Features; @@ -1899,6 +1900,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+sme-f8f32")); EXPECT_TRUE(llvm::is_contained(Features, "+sme-fa64")); EXPECT_TRUE(llvm::is_contained(Features, "+cpa")); + EXPECT_TRUE(llvm::is_contained(Features, "+pauth-lr")); // Assuming we listed every extension above, this should produce the same // result. (note that AEK_NONE doesn't have a name so it won't be in the From e414ba33b44971d47d24d75b7da94898d2cc8bde Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Thu, 21 Dec 2023 15:16:55 +0100 Subject: [PATCH 071/342] [ValueTracking] Shufflevector produces poison rather than undef Shufflevector semantics have changed so that poison mask elements return poison rather than undef. Reflect this in the canCreateUndefOrPoison() implementation. --- llvm/lib/Analysis/ValueTracking.cpp | 3 --- llvm/unittests/Analysis/ValueTrackingTest.cpp | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index efdb3fc285824..2ce660b9a858e 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6674,9 +6674,6 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly, return false; } case Instruction::ShuffleVector: { - // shufflevector may return undef. - if (PoisonOnly) - return false; ArrayRef<int> Mask = isa<ConstantExpr>(Op) ? cast<ConstantExpr>(Op)->getShuffleMask() : cast<ShuffleVectorInst>(Op)->getShuffleMask(); diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp index 0d3a594da0c06..27f631884072b 100644 --- a/llvm/unittests/Analysis/ValueTrackingTest.cpp +++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp @@ -1177,12 +1177,12 @@ TEST(ValueTracking, canCreatePoisonOrUndef) { {{false, false}, "shufflevector <4 x i32> %vx, <4 x i32> %vx2, " "<4 x i32> <i32 0, i32 1, i32 2, i32 3>"}, - {{false, true}, + {{true, false}, "shufflevector <4 x i32> %vx, <4 x i32> %vx2, " - "<4 x i32> <i32 0, i32 1, i32 2, i32 undef>"}, - {{false, true}, + "<4 x i32> <i32 0, i32 1, i32 2, i32 poison>"}, + {{true, false}, "shufflevector <vscale x 4 x i32> %svx, " - "<vscale x 4 x i32> %svx, <vscale x 4 x i32> undef"}, + "<vscale x 4 x i32> %svx, <vscale x 4 x i32> poison"}, {{true, false}, "call i32 @g(i32 %x)"}, {{false, false}, "call noundef i32 @g(i32 %x)"}, {{true, false}, "fcmp nnan oeq float %fx, %fy"}, From 2fe94cead0a55d8d269e6e32bb95f7aa987d4db8 Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Thu, 21 Dec 2023 22:09:11 +0800 Subject: [PATCH 072/342] [X86][NFC] Refine code in X86InstrArithmetic.td 1. Simplify the variable name 2. Change HasOddOpcode to HasEvenOpcode b/c a. 
opcode of any 8-bit arithmetic instruction is even b. opcode of a 16/32/64-bit arithmetic instruction is usually odd, but it can be even sometimes, e.g. INC/DEC, ADCX/ADOX c. so that we can remove `let Opcode = o` for the mentioned corner cases. --- llvm/lib/Target/X86/X86InstrArithmetic.td | 138 +++++++++++----------- llvm/lib/Target/X86/X86InstrUtils.td | 22 ++-- 2 files changed, 78 insertions(+), 82 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 664ba316cd75b..dad8818b1c3b7 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -232,7 +232,7 @@ class BinOpMIF_MF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> // BinOpMI8 - Instructions that read "[mem], imm8". class BinOpMI8<string m, X86TypeInfo t, Format f> - : ITy<0x82, f, t, (outs), (ins t.MemOperand:$dst, t.Imm8Operand:$src), m, + : ITy<0x83, f, t, (outs), (ins t.MemOperand:$dst, t.Imm8Operand:$src), m, "{$src, $dst|$dst, $src}", []> { let ImmT = Imm8; let mayLoad = 1; @@ -292,7 +292,7 @@ class UnaryOpM<bits<8> o, Format f, string m, X86TypeInfo t, list<dag> p> // INCDECR - Instructions like "inc reg". class INCDECR<Format f, string m, X86TypeInfo t, SDPatternOperator node> - : UnaryOpR<0xFE, f, m, t, + : UnaryOpR<0xFF, f, m, t, [(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1, 1))]>, DefEFLAGS { let isConvertibleToThreeAddress = 1; // Can xform into LEA. @@ -300,7 +300,7 @@ class INCDECR<Format f, string m, X86TypeInfo t, SDPatternOperator node> // INCDECM - Instructions like "inc [mem]". class INCDECM<Format f, string m, X86TypeInfo t, int num> - : UnaryOpM<0xFE, f, m, t, + : UnaryOpM<0xFF, f, m, t, [(store (add (t.LoadNode addr:$dst), num), addr:$dst), (implicit EFLAGS)]>, DefEFLAGS; @@ -309,7 +309,6 @@ class INCDECR_ALT<bits<8> o, string m, X86TypeInfo t> : UnaryOpR<o, AddRegFrm, m, t, []>, DefEFLAGS { // Short forms only valid in 32-bit mode. Selected during MCInst lowering. let Predicates = [Not64BitMode]; - let Opcode = o; } // MulOpR - Instructions like "mul reg". @@ -664,13 +663,13 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. 
- def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, RegMRM>; - def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, RegMRM>; - def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, RegMRM>; + def NAME#16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>; + def NAME#32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>; + def NAME#64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>; - def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>; - def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>; - def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>; + def NAME#16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>; + def NAME#32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>; } } // Constraints = "$src1 = $dst" @@ -687,10 +686,10 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#64mi8 : BinOpMI8_MF<mnemonic, Xi64, MemMRM>; def NAME#8mi : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_MF<0x80, mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_MF<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#16mi : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>; let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMI_MF<0x80, mnemonic, Xi64, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; // These are for the disassembler since 0x82 opcode behaves like 0x80, but // not in 64-bit mode. @@ -744,13 +743,13 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. - def NAME#16ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi16, RegMRM>; - def NAME#32ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi32, RegMRM>; - def NAME#64ri8 : BinOpRI8F_RF<0x82, mnemonic, Xi64, RegMRM>; + def NAME#16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>; + def NAME#32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>; + def NAME#64ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>; - def NAME#16ri : BinOpRIF_RF<0x80, mnemonic, Xi16, opnode, RegMRM>; - def NAME#32ri : BinOpRIF_RF<0x80, mnemonic, Xi32, opnode, RegMRM>; - def NAME#64ri32: BinOpRIF_RF<0x80, mnemonic, Xi64, opnode, RegMRM>; + def NAME#16ri : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>; } } // Constraints = "$src1 = $dst" @@ -767,10 +766,10 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#64mi8 : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>; def NAME#8mi : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMIF_MF<0x80, mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMIF_MF<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#16mi : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>; let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMIF_MF<0x80, mnemonic, Xi64, opnode, MemMRM>; + def NAME#64mi32 : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; // These are for the disassembler since 0x82 opcode behaves like 0x80, but // not in 64-bit mode. 
@@ -822,13 +821,13 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. - def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, RegMRM>; - def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, RegMRM>; - def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, RegMRM>; + def NAME#16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>; + def NAME#32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>; + def NAME#64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>; - def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>; - def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>; - def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>; + def NAME#16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>; + def NAME#32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>; + def NAME#64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>; } def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; @@ -844,10 +843,10 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, MemMRM>; def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>; + def NAME#16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>; + def NAME#32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>; let Predicates = [In64BitMode] in - def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>; + def NAME#64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>; // These are for the disassembler since 0x82 opcode behaves like 0x80, but // not in 64-bit mode. @@ -868,16 +867,16 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } -defm AND : ArithBinOp_RF<0x20, 0x22, 0x24, "and", MRM4r, MRM4m, +defm AND : ArithBinOp_RF<0x21, 0x23, 0x25, "and", MRM4r, MRM4m, X86and_flag, and, 1, 0, 0>; -defm OR : ArithBinOp_RF<0x08, 0x0A, 0x0C, "or", MRM1r, MRM1m, +defm OR : ArithBinOp_RF<0x09, 0x0B, 0x0D, "or", MRM1r, MRM1m, X86or_flag, or, 1, 0, 0>; -defm XOR : ArithBinOp_RF<0x30, 0x32, 0x34, "xor", MRM6r, MRM6m, +defm XOR : ArithBinOp_RF<0x31, 0x33, 0x35, "xor", MRM6r, MRM6m, X86xor_flag, xor, 1, 0, 0>; -defm ADD : ArithBinOp_RF<0x00, 0x02, 0x04, "add", MRM0r, MRM0m, +defm ADD : ArithBinOp_RF<0x01, 0x03, 0x05, "add", MRM0r, MRM0m, X86add_flag, add, 1, 1, 1>; let isCompare = 1 in { -defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m, +defm SUB : ArithBinOp_RF<0x29, 0x2B, 0x2D, "sub", MRM5r, MRM5m, X86sub_flag, sub, 0, 1, 0>; } @@ -891,13 +890,13 @@ def XOR8rr_NOREX : I<0x30, MRMDestReg, (outs GR8_NOREX:$dst), Sched<[WriteALU]>; // Arithmetic. -defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag, +defm ADC : ArithBinOp_RFF<0x11, 0x13, 0x15, "adc", MRM2r, MRM2m, X86adc_flag, 1, 0>; -defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag, +defm SBB : ArithBinOp_RFF<0x19, 0x1B, 0x1D, "sbb", MRM3r, MRM3m, X86sbb_flag, 0, 0>; let isCompare = 1 in { -defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; +defm CMP : ArithBinOp_F<0x39, 0x3B, 0x3D, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>; } // Patterns to recognize loads on the LHS of an ADC. 
We can't make X86adc_flag @@ -1040,32 +1039,32 @@ let isCompare = 1 in { // combine them. This gives bunch of other patterns that start with // and a chance to match. def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , null_frag>; - def TEST16rr : BinOpRR_F<0x84, "test", Xi16, null_frag>; - def TEST32rr : BinOpRR_F<0x84, "test", Xi32, null_frag>; - def TEST64rr : BinOpRR_F<0x84, "test", Xi64, null_frag>; + def TEST16rr : BinOpRR_F<0x85, "test", Xi16, null_frag>; + def TEST32rr : BinOpRR_F<0x85, "test", Xi32, null_frag>; + def TEST64rr : BinOpRR_F<0x85, "test", Xi64, null_frag>; } // isCommutable def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , null_frag>; -def TEST16mr : BinOpMR_F<0x84, "test", Xi16, null_frag>; -def TEST32mr : BinOpMR_F<0x84, "test", Xi32, null_frag>; -def TEST64mr : BinOpMR_F<0x84, "test", Xi64, null_frag>; +def TEST16mr : BinOpMR_F<0x85, "test", Xi16, null_frag>; +def TEST32mr : BinOpMR_F<0x85, "test", Xi32, null_frag>; +def TEST64mr : BinOpMR_F<0x85, "test", Xi64, null_frag>; def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; -def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; -def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; -def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; +def TEST16ri : BinOpRI_F<0xF7, "test", Xi16, X86testpat, MRM0r>; +def TEST32ri : BinOpRI_F<0xF7, "test", Xi32, X86testpat, MRM0r>; +def TEST64ri32 : BinOpRI_F<0xF7, "test", Xi64, X86testpat, MRM0r>; def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; -def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>; -def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>; +def TEST16mi : BinOpMI_F<0xF7, "test", Xi16, X86testpat, MRM0m>; +def TEST32mi : BinOpMI_F<0xF7, "test", Xi32, X86testpat, MRM0m>; let Predicates = [In64BitMode] in - def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>; + def TEST64mi32 : BinOpMI_F<0xF7, "test", Xi64, X86testpat, MRM0m>; def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, "{$src, %al|al, $src}">; -def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX, "{$src, %ax|ax, $src}">; -def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX, "{$src, %eax|eax, $src}">; -def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX, "{$src, %rax|rax, $src}">; +def TEST16i16 : BinOpAI_F<0xA9, "test", Xi16, AX, "{$src, %ax|ax, $src}">; +def TEST32i32 : BinOpAI_F<0xA9, "test", Xi32, EAX, "{$src, %eax|eax, $src}">; +def TEST64i32 : BinOpAI_F<0xA9, "test", Xi64, RAX, "{$src, %rax|rax, $src}">; } // isCompare // Patterns to match a relocImm into the immediate field. @@ -1189,27 +1188,25 @@ let Uses = [RDX] in // // We don't have patterns for these as there is no advantage over ADC for // most code. 
-class ADCOXOpRR <bits<8> opcode, string mnemonic, X86TypeInfo info> - : BinOpRR_RF<opcode, mnemonic, info, null_frag> { - let Opcode = opcode; +class ADCOXOpRR <string m, X86TypeInfo t> + : BinOpRR_RF<0xF6, m, t, null_frag> { let OpSize = OpSizeFixed; let Form = MRMSrcReg; + let isCommutable = 1; } -class ADCOXOpRM <bits<8> opcode, string mnemonic, X86TypeInfo info> - : BinOpRM_RF<opcode, mnemonic, info, null_frag> { - let Opcode = opcode; +class ADCOXOpRM <string m, X86TypeInfo t> + : BinOpRM_RF<0xF6, m, t, null_frag> { let OpSize = OpSizeFixed; let Form = MRMSrcMem; } let Predicates = [HasADX], Constraints = "$src1 = $dst" in { - let SchedRW = [WriteADC], isCommutable = 1 in { - def ADCX32rr : ADCOXOpRR<0xF6, "adcx", Xi32>, T8PD; - def ADCX64rr : ADCOXOpRR<0xF6, "adcx", Xi64>, T8PD; - - def ADOX32rr : ADCOXOpRR<0xF6, "adox", Xi32>, T8XS; - def ADOX64rr : ADCOXOpRR<0xF6, "adox", Xi64>, T8XS; + let SchedRW = [WriteADC] in { + def ADCX32rr : ADCOXOpRR<"adcx", Xi32>, T8PD; + def ADCX64rr : ADCOXOpRR<"adcx", Xi64>, T8PD; + def ADOX32rr : ADCOXOpRR<"adox", Xi32>, T8XS; + def ADOX64rr : ADCOXOpRR<"adox", Xi64>, T8XS; } let SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold, @@ -1217,10 +1214,9 @@ let Predicates = [HasADX], Constraints = "$src1 = $dst" in { ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, // Implicit read of EFLAGS WriteADC.ReadAfterFold] in { - def ADCX32rm : ADCOXOpRM<0xF6, "adcx", Xi32>, T8PD; - def ADCX64rm : ADCOXOpRM<0xF6, "adcx", Xi64>, T8PD; - - def ADOX32rm : ADCOXOpRM<0xF6, "adox", Xi32>, T8XS; - def ADOX64rm : ADCOXOpRM<0xF6, "adox", Xi64>, T8XS; + def ADCX32rm : ADCOXOpRM<"adcx", Xi32>, T8PD; + def ADCX64rm : ADCOXOpRM<"adcx", Xi64>, T8PD; + def ADOX32rm : ADCOXOpRM<"adox", Xi32>, T8XS; + def ADOX64rm : ADCOXOpRM<"adox", Xi64>, T8XS; } } diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 80854bf606316..a94efd2b1a050 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -147,7 +147,7 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, Operand immoperand, SDPatternOperator immoperator, SDPatternOperator immnosuoperator, Operand imm8operand, SDPatternOperator imm8operator, SDPatternOperator imm8nosuoperator, - bit hasOddOpcode, OperandSize opSize, + bit hasEvenOpcode, OperandSize opSize, bit hasREX_W> { /// VT - This is the value type itself. ValueType VT = vt; @@ -197,10 +197,10 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, SDPatternOperator Imm8NoSuOperator = imm8nosuoperator; - /// HasOddOpcode - This bit is true if the instruction should have an odd (as - /// opposed to even) opcode. Operations on i8 are usually even, operations on - /// other datatypes are odd. - bit HasOddOpcode = hasOddOpcode; + /// HasEvenOpcode - This bit is true if the instruction should have an even (as + /// opposed to odd) opcode. Operations on i8 are even, operations on + /// other datatypes are usually odd. + bit HasEvenOpcode = hasEvenOpcode; /// OpSize - Selects whether the instruction needs a 0x66 prefix based on /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. 
i16 sets this @@ -216,16 +216,16 @@ def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, Imm8, i8imm, imm_su, imm, i8imm, invalid_node, invalid_node, - 0, OpSizeFixed, 0>; + 1, OpSizeFixed, 0>; def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, Imm16, i16imm, imm_su, imm, i16i8imm, i16immSExt8_su, i16immSExt8, - 1, OpSize16, 0>; + 0, OpSize16, 0>; def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, Imm32, i32imm, imm_su, imm, i32i8imm, i32immSExt8_su, i32immSExt8, - 1, OpSize32, 0>; + 0, OpSize32, 0>; def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, Imm32S, i64i32imm, i64immSExt32_su, i64immSExt32, i64i8imm, i64immSExt8_su, - i64immSExt8, 1, OpSizeFixed, 1>; + i64immSExt8, 0, OpSizeFixed, 1>; // Group template arguments that can be derived from the vector type (EltNum x // EltVT). These are things like the register class for the writemask, etc. @@ -992,8 +992,8 @@ class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, string mnemonic, string args, list<dag> pattern> : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4}, - opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode }, - f, outs, ins, + opcode{3}, opcode{2}, opcode{1}, + !if(!eq(typeinfo.HasEvenOpcode, 1), 0, opcode{0})}, f, outs, ins, !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> { let hasSideEffects = 0; From 3107f313f19a921469776ca498b6aaa0984feda0 Mon Sep 17 00:00:00 2001 From: Radu Salavat <radu.salavat@arm.com> Date: Thu, 21 Dec 2023 16:37:51 +0200 Subject: [PATCH 073/342] [Flang, Clang] Enable and test 'rdynamic' flag (#75598) Enable and test 'rdynamic' flag --- clang/include/clang/Driver/Options.td | 3 ++- flang/test/Driver/dynamic-linker.f90 | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 965d402af2d7b..2b93ddf033499 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5308,7 +5308,8 @@ def rewrite_objc : Flag<["-"], "rewrite-objc">, Flags<[NoXarchOption]>, def rewrite_legacy_objc : Flag<["-"], "rewrite-legacy-objc">, Flags<[NoXarchOption]>, HelpText<"Rewrite Legacy Objective-C source to C++">; -def rdynamic : Flag<["-"], "rdynamic">, Group<Link_Group>; +def rdynamic : Flag<["-"], "rdynamic">, Group<Link_Group>, + Visibility<[ClangOption, FlangOption]>; def resource_dir : Separate<["-"], "resource-dir">, Flags<[NoXarchOption, HelpHidden]>, Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, diff --git a/flang/test/Driver/dynamic-linker.f90 b/flang/test/Driver/dynamic-linker.f90 index df119c22a2ea5..1cbd407d21ce0 100644 --- a/flang/test/Driver/dynamic-linker.f90 +++ b/flang/test/Driver/dynamic-linker.f90 @@ -7,6 +7,7 @@ ! RUN: %flang -### --target=x86_64-windows-msvc -rpath /path/to/dir -shared \ ! RUN: -static %s 2>&1 | FileCheck \ ! RUN: --check-prefixes=MSVC-LINKER-OPTIONS %s +! RUN: %flang -### --target=aarch64-linux-none -rdynamic %s 2>&1 | FileCheck --check-prefixes=RDYNAMIC-LINKER-OPTION %s ! TODO: Could the linker have an extension or a suffix? ! GNU-LINKER-OPTIONS: "{{.*}}ld{{(.exe)?}}" @@ -14,6 +15,9 @@ ! GNU-LINKER-OPTIONS-SAME: "-static" ! GNU-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir" +! RDYNAMIC-LINKER-OPTION: "{{.*}}ld" +! RDYNAMIC-LINKER-OPTION-SAME: "-export-dynamic" + ! For MSVC, adding -static does not add any additional linker options. ! 
MSVC-LINKER-OPTIONS: "{{.*}}link{{(.exe)?}}" ! MSVC-LINKER-OPTIONS-SAME: "-dll" From 38c1ff89eee769d19ae07b585530f8edd69e124a Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Thu, 21 Dec 2023 15:36:07 +0100 Subject: [PATCH 074/342] [CVP] Add additional tests for undef check (NFC) --- .../CorrelatedValuePropagation/cond-at-use.ll | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll b/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll index 7ec1028d65e0e..546baf086cdbb 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll @@ -567,3 +567,33 @@ define i16 @cond_value_may_not_well_defined(i16 %x) { %sel = select i1 %cmp, i16 %and, i16 24 ret i16 %sel } + +define i16 @and_elide_poison_flags(i16 noundef %a) { +; CHECK-LABEL: @and_elide_poison_flags( +; CHECK-NEXT: [[X:%.*]] = add nuw i16 [[A:%.*]], 1 +; CHECK-NEXT: [[AND:%.*]] = and i16 [[X]], 7 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[X]], 8 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i16 [[AND]], i16 24 +; CHECK-NEXT: ret i16 [[SEL]] +; + %x = add nuw i16 %a, 1 + %and = and i16 %x, 7 + %cmp = icmp ult i16 %x, 8 + %sel = select i1 %cmp, i16 %and, i16 24 + ret i16 %sel +} + +define i16 @and_elide_poison_flags_missing_noundef(i16 %a) { +; CHECK-LABEL: @and_elide_poison_flags_missing_noundef( +; CHECK-NEXT: [[X:%.*]] = add nuw i16 [[A:%.*]], 1 +; CHECK-NEXT: [[AND:%.*]] = and i16 [[X]], 7 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[X]], 8 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i16 [[AND]], i16 24 +; CHECK-NEXT: ret i16 [[SEL]] +; + %x = add nuw i16 %a, 1 + %and = and i16 %x, 7 + %cmp = icmp ult i16 %x, 8 + %sel = select i1 %cmp, i16 %and, i16 24 + ret i16 %sel +} From 791200b3bc6898f478138b63e91c03f0c68c7061 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com> Date: Thu, 21 Dec 2023 08:39:57 -0600 Subject: [PATCH 075/342] [flang][OpenMP] Avoid captures of references to structured bindings Handle one more case missed in ad37c8694e. --- flang/lib/Lower/OpenMP.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index 1acc49abb1da0..8ed31766725e1 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -2197,10 +2197,12 @@ static void genBodyOfTargetDataOp( if (fir::isa_builtin_cptr_type(refType.getElementType())) { converter.bindSymbol(*argSymbol, arg); } else { + // Avoid capture of a reference to a structured binding. + const Fortran::semantics::Symbol *sym = argSymbol; extVal.match( [&](const fir::MutableBoxValue &mbv) { converter.bindSymbol( - *argSymbol, + *sym, fir::MutableBoxValue( arg, fir::factory::getNonDeferredLenParams(extVal), {})); }, @@ -2489,7 +2491,7 @@ static void genBodyOfTargetOp( // Bind the symbols to their corresponding block arguments. for (auto [argIndex, argSymbol] : llvm::enumerate(mapSymbols)) { const mlir::BlockArgument &arg = region.getArgument(argIndex); - // Avoid capture of reference to a structured binding. + // Avoid capture of a reference to a structured binding. 
const Fortran::semantics::Symbol *sym = argSymbol; fir::ExtendedValue extVal = converter.getSymbolExtendedValue(*sym); extVal.match( From 8674a023bcacb677ce48b8831e2ae35b5aa2d8ef Mon Sep 17 00:00:00 2001 From: Chia <sun1011jacobi@gmail.com> Date: Thu, 21 Dec 2023 23:47:21 +0900 Subject: [PATCH 076/342] [InstCombine] fold (Binop phi(a, b) phi(b, a)) -> (Binop a, b) while Binop is commutative. (#75765) Alive2 proof: https://alive2.llvm.org/ce/z/2P8gq- This patch closes #73905 --- .../InstCombine/InstCombineCalls.cpp | 22 + .../InstCombine/InstCombineInternal.h | 15 + .../InstCombine/InstructionCombining.cpp | 53 ++ .../commutative-operation-over-phis.ll | 645 ++++++++++++++++++ 4 files changed, 735 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/commutative-operation-over-phis.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index c496f9c7419b5..a272357fa04a4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1539,6 +1539,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Instruction *I = foldCommutativeIntrinsicOverSelects(*II)) return I; + if (Instruction *I = foldCommutativeIntrinsicOverPhis(*II)) + return I; + if (CallInst *NewCall = canonicalizeConstantArg0ToArg1(CI)) return NewCall; } @@ -4237,3 +4240,22 @@ InstCombinerImpl::foldCommutativeIntrinsicOverSelects(IntrinsicInst &II) { return nullptr; } + +Instruction * +InstCombinerImpl::foldCommutativeIntrinsicOverPhis(IntrinsicInst &II) { + assert(II.isCommutative() && "Instruction should be commutative"); + + PHINode *LHS = dyn_cast<PHINode>(II.getOperand(0)); + PHINode *RHS = dyn_cast<PHINode>(II.getOperand(1)); + + if (!LHS || !RHS) + return nullptr; + + if (auto P = matchSymmetricPhiNodesPair(LHS, RHS)) { + replaceOperand(II, 0, P->first); + replaceOperand(II, 1, P->second); + return &II; + } + + return nullptr; +} diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index f86db698ef8f1..9e76a0cf17b18 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -278,6 +278,16 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final IntrinsicInst &Tramp); Instruction *foldCommutativeIntrinsicOverSelects(IntrinsicInst &II); + // Match a pair of Phi Nodes like + // phi [a, BB0], [b, BB1] & phi [b, BB0], [a, BB1] + // Return the matched two operands. + std::optional<std::pair<Value *, Value *>> + matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS); + + // Tries to fold (op phi(a, b) phi(b, a)) -> (op a, b) + // while op is a commutative intrinsic call. + Instruction *foldCommutativeIntrinsicOverPhis(IntrinsicInst &II); + Value *simplifyMaskedLoad(IntrinsicInst &II); Instruction *simplifyMaskedStore(IntrinsicInst &II); Instruction *simplifyMaskedGather(IntrinsicInst &II); @@ -492,6 +502,11 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final /// X % (C0 * C1) Value *SimplifyAddWithRemainder(BinaryOperator &I); + // Tries to fold (Binop phi(a, b) phi(b, a)) -> (Binop a, b) + // while Binop is commutative. + Value *SimplifyPhiCommutativeBinaryOp(BinaryOperator &I, Value *LHS, + Value *RHS); + // Binary Op helper for select operations where the expression can be // efficiently reorganized. 
Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS, diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 4188b5b46e87e..775720ab43a5c 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1096,6 +1096,54 @@ Value *InstCombinerImpl::foldUsingDistributiveLaws(BinaryOperator &I) { return SimplifySelectsFeedingBinaryOp(I, LHS, RHS); } +std::optional<std::pair<Value *, Value *>> +InstCombinerImpl::matchSymmetricPhiNodesPair(PHINode *LHS, PHINode *RHS) { + if (LHS->getParent() != RHS->getParent()) + return std::nullopt; + + if (LHS->getNumIncomingValues() < 2) + return std::nullopt; + + if (!equal(LHS->blocks(), RHS->blocks())) + return std::nullopt; + + Value *L0 = LHS->getIncomingValue(0); + Value *R0 = RHS->getIncomingValue(0); + + for (unsigned I = 1, E = LHS->getNumIncomingValues(); I != E; ++I) { + Value *L1 = LHS->getIncomingValue(I); + Value *R1 = RHS->getIncomingValue(I); + + if ((L0 == L1 && R0 == R1) || (L0 == R1 && R0 == L1)) + continue; + + return std::nullopt; + } + + return std::optional(std::pair(L0, R0)); +} + +Value *InstCombinerImpl::SimplifyPhiCommutativeBinaryOp(BinaryOperator &I, + Value *Op0, + Value *Op1) { + assert(I.isCommutative() && "Instruction should be commutative"); + + PHINode *LHS = dyn_cast<PHINode>(Op0); + PHINode *RHS = dyn_cast<PHINode>(Op1); + + if (!LHS || !RHS) + return nullptr; + + if (auto P = matchSymmetricPhiNodesPair(LHS, RHS)) { + Value *BI = Builder.CreateBinOp(I.getOpcode(), P->first, P->second); + if (auto *BO = dyn_cast<BinaryOperator>(BI)) + BO->copyIRFlags(&I); + return BI; + } + + return nullptr; +} + Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS, Value *RHS) { @@ -1529,6 +1577,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { BO.getParent() != Phi1->getParent()) return nullptr; + if (BO.isCommutative()) { + if (Value *V = SimplifyPhiCommutativeBinaryOp(BO, Phi0, Phi1)) + return replaceInstUsesWith(BO, V); + } + // Fold if there is at least one specific constant value in phi0 or phi1's // incoming values that comes from the same block and this specific constant // value can be used to do optimization for specific binary operator. 
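Editor's note: before the autogenerated regression tests below, here is a minimal, standalone sketch of the symmetry check that the new matchSymmetricPhiNodesPair helper performs. It deliberately models a phi's incoming list as plain (block name, value name) string pairs rather than real PHINode objects, so the names Incoming, Phi, and matchSymmetricPhiPair are illustrative only and are not part of this patch or of the LLVM API.

// Standalone sketch (not LLVM code): a "phi" is a list of (block, value) pairs.
#include <iostream>
#include <optional>
#include <string>
#include <utility>
#include <vector>

using Incoming = std::pair<std::string, std::string>; // (predecessor block, incoming value)
using Phi = std::vector<Incoming>;

// Mirrors the shape of the check: both phis must cover the same predecessor
// blocks in the same order, and every incoming pair must be either (A, B) or
// (B, A) for one fixed pair of values (A, B).
std::optional<std::pair<std::string, std::string>>
matchSymmetricPhiPair(const Phi &LHS, const Phi &RHS) {
  if (LHS.size() != RHS.size() || LHS.size() < 2)
    return std::nullopt;
  const std::string &A = LHS[0].second;
  const std::string &B = RHS[0].second;
  for (size_t I = 0; I != LHS.size(); ++I) {
    if (LHS[I].first != RHS[I].first) // different predecessor block
      return std::nullopt;
    const std::string &L = LHS[I].second;
    const std::string &R = RHS[I].second;
    if (!((L == A && R == B) || (L == B && R == A)))
      return std::nullopt;
  }
  return std::make_pair(A, B);
}

int main() {
  Phi Phi1 = {{"entry", "a"}, {"then", "b"}};
  Phi Phi2 = {{"entry", "b"}, {"then", "a"}};
  if (auto P = matchSymmetricPhiPair(Phi1, Phi2))
    std::cout << "mul phi1, phi2  ==>  mul " << P->first << ", " << P->second << "\n";
}

When the two phis are anti-symmetric in this way, both operand orders occur along the incoming edges, which is why the rewrite (Binop phi(a, b), phi(b, a)) -> (Binop a, b) is restricted to commutative operations and commutative intrinsics in the code above.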
diff --git a/llvm/test/Transforms/InstCombine/commutative-operation-over-phis.ll b/llvm/test/Transforms/InstCombine/commutative-operation-over-phis.ll new file mode 100644 index 0000000000000..e8b0fb198bd11 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/commutative-operation-over-phis.ll @@ -0,0 +1,645 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=instcombine -S < %s | FileCheck %s + +declare void @dummy() + +declare i32 @llvm.smax.i32(i32 %a, i32 %b) +declare i32 @llvm.smin.i32(i32 %a, i32 %b) +declare i32 @llvm.umax.i32(i32 %a, i32 %b) +declare i32 @llvm.umin.i32(i32 %a, i32 %b) +declare float @llvm.maxnum.f32(float %a, float %b) +declare float @llvm.minnum.f32(float %a, float %b) +declare float @llvm.maximum.f32(float %a, float %b) +declare float @llvm.minimum.f32(float %a, float %b) +declare float @llvm.pow.f32(float %a, float %b) + +define i8 @fold_phi_mul(i1 %c, i8 %a, i8 %b) { +; CHECK-LABEL: define i8 @fold_phi_mul( +; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = mul i8 [[A]], [[B]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i8 [%a, %entry], [%b, %then] + %phi2 = phi i8 [%b, %entry], [%a, %then] + %ret = mul i8 %phi1, %phi2 + ret i8 %ret +} + +define i8 @fold_phi_mul_three(i1 %c, i1 %d, i8 %a, i8 %b) { +; CHECK-LABEL: define i8 @fold_phi_mul_three( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN1:%.*]], label [[END:%.*]] +; CHECK: then1: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br i1 [[D]], label [[THEN2:%.*]], label [[END]] +; CHECK: then2: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = mul i8 [[A]], [[B]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + br i1 %c, label %then1, label %end +then1: + call void @dummy() + br i1 %d, label %then2, label %end +then2: + call void @dummy() + br label %end +end: + %phi1 = phi i8 [%a, %entry], [%b, %then1], [%a, %then2] + %phi2 = phi i8 [%b, %entry], [%a, %then1], [%b, %then2] + %ret = mul i8 %phi1, %phi2 + ret i8 %ret +} + +define i8 @fold_phi_mul_three_notopt(i1 %c, i1 %d, i8 %a, i8 %b) { +; CHECK-LABEL: define i8 @fold_phi_mul_three_notopt( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN1:%.*]], label [[END:%.*]] +; CHECK: then1: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br i1 [[D]], label [[THEN2:%.*]], label [[END]] +; CHECK: then2: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI1:%.*]] = phi i8 [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[THEN1]] ], [ [[A]], [[THEN2]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i8 [ [[B]], [[ENTRY]] ], [ [[A]], [[THEN1]] ], [ [[A]], [[THEN2]] ] +; CHECK-NEXT: [[RET:%.*]] = mul i8 [[PHI1]], [[PHI2]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + br i1 %c, label %then1, label %end +then1: + call void @dummy() + br i1 %d, label %then2, label %end +then2: + call void @dummy() + br label %end +end: + %phi1 = phi i8 [%a, %entry], [%b, %then1], [%a, %then2] + %phi2 = phi i8 [%b, %entry], [%a, %then1], [%a, %then2] + %ret = mul i8 %phi1, 
%phi2 + ret i8 %ret +} + +define i8 @fold_phi_mul_nsw_nuw(i1 %c, i8 %a, i8 %b) { +; CHECK-LABEL: define i8 @fold_phi_mul_nsw_nuw( +; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = mul nuw nsw i8 [[A]], [[B]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i8 [%a, %entry], [%b, %then] + %phi2 = phi i8 [%b, %entry], [%a, %then] + %ret = mul nsw nuw i8 %phi1, %phi2 + ret i8 %ret +} + +define <2 x i8> @fold_phi_mul_fix_vec(i1 %c, <2 x i8> %a, <2 x i8> %b) { +; CHECK-LABEL: define <2 x i8> @fold_phi_mul_fix_vec( +; CHECK-SAME: i1 [[C:%.*]], <2 x i8> [[A:%.*]], <2 x i8> [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = mul <2 x i8> [[A]], [[B]] +; CHECK-NEXT: ret <2 x i8> [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi <2 x i8> [%a, %entry], [%b, %then] + %phi2 = phi <2 x i8> [%b, %entry], [%a, %then] + %ret = mul <2 x i8> %phi1, %phi2 + ret <2 x i8> %ret +} + +define <vscale x 2 x i8> @fold_phi_mul_scale_vec(i1 %c, <vscale x 2 x i8> %a, <vscale x 2 x i8> %b) { +; CHECK-LABEL: define <vscale x 2 x i8> @fold_phi_mul_scale_vec( +; CHECK-SAME: i1 [[C:%.*]], <vscale x 2 x i8> [[A:%.*]], <vscale x 2 x i8> [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = mul <vscale x 2 x i8> [[A]], [[B]] +; CHECK-NEXT: ret <vscale x 2 x i8> [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi <vscale x 2 x i8> [%a, %entry], [%b, %then] + %phi2 = phi <vscale x 2 x i8> [%b, %entry], [%a, %then] + %ret = mul <vscale x 2 x i8> %phi1, %phi2 + ret <vscale x 2 x i8> %ret +} + +define i8 @fold_phi_mul_commute(i1 %c, i8 %a, i8 %b) { +; CHECK-LABEL: define i8 @fold_phi_mul_commute( +; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = mul i8 [[A]], [[B]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i8 [%a, %entry], [%b, %then] + %phi2 = phi i8 [%a, %then], [%b, %entry] + %ret = mul i8 %phi1, %phi2 + ret i8 %ret +} + + +define i8 @fold_phi_mul_notopt(i1 %c, i8 %a, i8 %b, i8 %d) { +; CHECK-LABEL: define i8 @fold_phi_mul_notopt( +; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]], i8 [[D:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI1:%.*]] = phi i8 [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[THEN]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i8 [ [[B]], [[ENTRY]] ], [ [[D]], [[THEN]] ] +; CHECK-NEXT: [[RET:%.*]] = mul i8 [[PHI1]], [[PHI2]] +; CHECK-NEXT: ret i8 [[RET]] +; 
+entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i8 [%a, %entry], [%b, %then] + %phi2 = phi i8 [%b, %entry], [%d, %then] + %ret = mul i8 %phi1, %phi2 + ret i8 %ret +} + + +define i8 @fold_phi_sub(i1 %c, i8 %a, i8 %b) { +; CHECK-LABEL: define i8 @fold_phi_sub( +; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI1:%.*]] = phi i8 [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[THEN]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i8 [ [[B]], [[ENTRY]] ], [ [[A]], [[THEN]] ] +; CHECK-NEXT: [[RET:%.*]] = sub i8 [[PHI1]], [[PHI2]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i8 [%a, %entry], [%b, %then] + %phi2 = phi i8 [%b, %entry], [%a, %then] + %ret = sub i8 %phi1, %phi2 + ret i8 %ret +} + + +define i8 @fold_phi_add(i1 %c, i8 %a, i8 %b) { +; CHECK-LABEL: define i8 @fold_phi_add( +; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = add i8 [[A]], [[B]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i8 [%a, %entry], [%b, %then] + %phi2 = phi i8 [%b, %entry], [%a, %then] + %ret = add i8 %phi1, %phi2 + ret i8 %ret +} + +define i8 @fold_phi_and(i1 %c, i8 %a, i8 %b) { +; CHECK-LABEL: define i8 @fold_phi_and( +; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = and i8 [[A]], [[B]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i8 [%a, %entry], [%b, %then] + %phi2 = phi i8 [%b, %entry], [%a, %then] + %ret = and i8 %phi1, %phi2 + ret i8 %ret +} + +define i8 @fold_phi_or(i1 %c, i8 %a, i8 %b) { +; CHECK-LABEL: define i8 @fold_phi_or( +; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = or i8 [[A]], [[B]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i8 [%a, %entry], [%b, %then] + %phi2 = phi i8 [%b, %entry], [%a, %then] + %ret = or i8 %phi1, %phi2 + ret i8 %ret +} + + +define i8 @fold_phi_xor(i1 %c, i8 %a, i8 %b) { +; CHECK-LABEL: define i8 @fold_phi_xor( +; CHECK-SAME: i1 [[C:%.*]], i8 [[A:%.*]], i8 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = xor i8 [[A]], [[B]] +; CHECK-NEXT: ret i8 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i8 [%a, %entry], [%b, %then] + %phi2 = phi i8 [%b, %entry], 
[%a, %then] + %ret = xor i8 %phi1, %phi2 + ret i8 %ret +} + + +define float @fold_phi_fadd(i1 %c, float %a, float %b) { +; CHECK-LABEL: define float @fold_phi_fadd( +; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = fadd float [[A]], [[B]] +; CHECK-NEXT: ret float [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi float [%a, %entry], [%b, %then] + %phi2 = phi float [%b, %entry], [%a, %then] + %ret = fadd float %phi1, %phi2 + ret float %ret +} + +define float @fold_phi_fadd_nnan(i1 %c, float %a, float %b) { +; CHECK-LABEL: define float @fold_phi_fadd_nnan( +; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = fadd nnan float [[A]], [[B]] +; CHECK-NEXT: ret float [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi float [%a, %entry], [%b, %then] + %phi2 = phi float [%b, %entry], [%a, %then] + %ret = fadd nnan float %phi1, %phi2 + ret float %ret +} + + +define float @fold_phi_fmul(i1 %c, float %a, float %b) { +; CHECK-LABEL: define float @fold_phi_fmul( +; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = fmul float [[A]], [[B]] +; CHECK-NEXT: ret float [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi float [%a, %entry], [%b, %then] + %phi2 = phi float [%b, %entry], [%a, %then] + %ret = fmul float %phi1, %phi2 + ret float %ret +} + + +define i32 @fold_phi_smax(i1 %c, i32 %a, i32 %b) { +; CHECK-LABEL: define i32 @fold_phi_smax( +; CHECK-SAME: i1 [[C:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.smax.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i32 [%a, %entry], [%b, %then] + %phi2 = phi i32 [%b, %entry], [%a, %then] + %ret = call i32 @llvm.smax.i32(i32 %phi1, i32 %phi2) + ret i32 %ret +} + + +define i32 @fold_phi_smin(i1 %c, i32 %a, i32 %b) { +; CHECK-LABEL: define i32 @fold_phi_smin( +; CHECK-SAME: i1 [[C:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.smin.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i32 [%a, %entry], [%b, %then] + %phi2 = phi i32 [%b, %entry], [%a, %then] + %ret = call i32 @llvm.smin.i32(i32 %phi1, i32 %phi2) + ret i32 %ret +} + + 
+define i32 @fold_phi_umax(i1 %c, i32 %a, i32 %b) { +; CHECK-LABEL: define i32 @fold_phi_umax( +; CHECK-SAME: i1 [[C:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.umax.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i32 [%a, %entry], [%b, %then] + %phi2 = phi i32 [%b, %entry], [%a, %then] + %ret = call i32 @llvm.umax.i32(i32 %phi1, i32 %phi2) + ret i32 %ret +} + +define i32 @fold_phi_umin(i1 %c, i32 %a, i32 %b) { +; CHECK-LABEL: define i32 @fold_phi_umin( +; CHECK-SAME: i1 [[C:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.umin.i32(i32 [[A]], i32 [[B]]) +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi i32 [%a, %entry], [%b, %then] + %phi2 = phi i32 [%b, %entry], [%a, %then] + %ret = call i32 @llvm.umin.i32(i32 %phi1, i32 %phi2) + ret i32 %ret +} + + +define float @fold_phi_maxnum(i1 %c, float %a, float %b) { +; CHECK-LABEL: define float @fold_phi_maxnum( +; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = call float @llvm.maxnum.f32(float [[A]], float [[B]]) +; CHECK-NEXT: ret float [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi float [%a, %entry], [%b, %then] + %phi2 = phi float [%b, %entry], [%a, %then] + %ret = call float @llvm.maxnum.f32(float %phi1, float %phi2) + ret float %ret +} + +define float @fold_phi_pow(i1 %c, float %a, float %b) { +; CHECK-LABEL: define float @fold_phi_pow( +; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[PHI1:%.*]] = phi float [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[THEN]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi float [ [[B]], [[ENTRY]] ], [ [[A]], [[THEN]] ] +; CHECK-NEXT: [[RET:%.*]] = call float @llvm.pow.f32(float [[PHI1]], float [[PHI2]]) +; CHECK-NEXT: ret float [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi float [%a, %entry], [%b, %then] + %phi2 = phi float [%b, %entry], [%a, %then] + %ret = call float @llvm.pow.f32(float %phi1, float %phi2) + ret float %ret +} + +define float @fold_phi_minnum(i1 %c, float %a, float %b) { +; CHECK-LABEL: define float @fold_phi_minnum( +; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = call float @llvm.minnum.f32(float [[A]], float [[B]]) +; CHECK-NEXT: ret float [[RET]] +; 
+entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi float [%a, %entry], [%b, %then] + %phi2 = phi float [%b, %entry], [%a, %then] + %ret = call float @llvm.minnum.f32(float %phi1, float %phi2) + ret float %ret +} + +define float @fold_phi_maximum(i1 %c, float %a, float %b) { +; CHECK-LABEL: define float @fold_phi_maximum( +; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = call float @llvm.maximum.f32(float [[A]], float [[B]]) +; CHECK-NEXT: ret float [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi float [%a, %entry], [%b, %then] + %phi2 = phi float [%b, %entry], [%a, %then] + %ret = call float @llvm.maximum.f32(float %phi1, float %phi2) + ret float %ret +} + +define float @fold_phi_minimum(i1 %c, float %a, float %b) { +; CHECK-LABEL: define float @fold_phi_minimum( +; CHECK-SAME: i1 [[C:%.*]], float [[A:%.*]], float [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[RET:%.*]] = call float @llvm.minimum.f32(float [[A]], float [[B]]) +; CHECK-NEXT: ret float [[RET]] +; +entry: + br i1 %c, label %then, label %end +then: + call void @dummy() + br label %end +end: + %phi1 = phi float [%a, %entry], [%b, %then] + %phi2 = phi float [%b, %entry], [%a, %then] + %ret = call float @llvm.minimum.f32(float %phi1, float %phi2) + ret float %ret +} + From 70b00b4a6aa06c906c30d614d5b0042fdbfdbd50 Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.foad@amd.com> Date: Thu, 21 Dec 2023 14:53:17 +0000 Subject: [PATCH 077/342] [AMDGPU] Rename AMDGPUGlobalAtomicRtn -> AMDGPUAtomicRtn (#76157) It is used for FLAT atomics as well as Global atomics. 
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 28 ++++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 51bd9b63c127e..cb48f54b13a6c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2353,14 +2353,14 @@ def int_amdgcn_s_get_waveid_in_workgroup : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; -class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic < +class AMDGPUAtomicRtn<LLVMType vt> : Intrinsic < [vt], [llvm_anyptr_ty, // vaddr vt], // vdata(VGPR) [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; -def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>; +def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>; // uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>, // <ray_dir>, <ray_inv_dir>, <texture_descr> @@ -2486,10 +2486,10 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var" [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; -def int_amdgcn_flat_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; -def int_amdgcn_flat_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; -def int_amdgcn_global_atomic_fmin_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; -def int_amdgcn_global_atomic_fmax_num : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; +def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; +def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; +def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; +def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; //===----------------------------------------------------------------------===// // Deep learning intrinsics. 
@@ -2692,7 +2692,7 @@ def int_amdgcn_udot8 : // gfx908 intrinsics // ===----------------------------------------------------------------------===// -def int_amdgcn_global_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; +def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>; // llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> : @@ -2728,11 +2728,11 @@ def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v // gfx90a intrinsics // ===----------------------------------------------------------------------===// -def int_amdgcn_global_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; -def int_amdgcn_global_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; -def int_amdgcn_flat_atomic_fadd : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; -def int_amdgcn_flat_atomic_fmin : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; -def int_amdgcn_flat_atomic_fmax : AMDGPUGlobalAtomicRtn<llvm_anyfloat_ty>; +def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>; +def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>; +def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>; +def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>; +def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>; def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>; def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>; @@ -2751,8 +2751,8 @@ def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, ll // ===----------------------------------------------------------------------===// // bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. -def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>; -def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUGlobalAtomicRtn<llvm_v2i16_ty>; +def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>; +def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>; def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic< [llvm_v2i16_ty], [LLVMQualPointerType<3>, llvm_v2i16_ty], From 8eccf2b872cc1a88a1e5d4e5af0bfdabfb66c7bb Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Thu, 21 Dec 2023 22:56:14 +0800 Subject: [PATCH 078/342] [X86] Set Uses = [EFLAGS] for ADCX/ADOX According to Intel SDE, ADCX reads CF and ADOX reads OF. `Uses` was set to empty by accident, the bug was not exposed b/c compiler never emits these instructions. --- llvm/lib/Target/X86/X86InstrArithmetic.td | 36 ++++++++--------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index dad8818b1c3b7..87feb7dc3b4ee 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -102,7 +102,7 @@ class BinOpRM_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node> (t.LoadNode addr:$src2)))]>, DefEFLAGS; // BinOpRMF_RF - Instructions that read "reg, [mem]", write "reg" and read/write // EFLAGS. 
-class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDNode node> +class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node> : BinOpRM<o, m, t, (outs t.RegClass:$dst), [(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1, (t.LoadNode addr:$src2), EFLAGS))]>, @@ -1189,34 +1189,24 @@ let Uses = [RDX] in // We don't have patterns for these as there is no advantage over ADC for // most code. class ADCOXOpRR <string m, X86TypeInfo t> - : BinOpRR_RF<0xF6, m, t, null_frag> { - let OpSize = OpSizeFixed; + : BinOpRRF_RF<0xF6, m, t, null_frag> { let Form = MRMSrcReg; let isCommutable = 1; } class ADCOXOpRM <string m, X86TypeInfo t> - : BinOpRM_RF<0xF6, m, t, null_frag> { - let OpSize = OpSizeFixed; + : BinOpRMF_RF<0xF6, m, t, null_frag> { let Form = MRMSrcMem; } -let Predicates = [HasADX], Constraints = "$src1 = $dst" in { - let SchedRW = [WriteADC] in { - def ADCX32rr : ADCOXOpRR<"adcx", Xi32>, T8PD; - def ADCX64rr : ADCOXOpRR<"adcx", Xi64>, T8PD; - def ADOX32rr : ADCOXOpRR<"adox", Xi32>, T8XS; - def ADOX64rr : ADCOXOpRR<"adox", Xi64>, T8XS; - } - - let SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold, - // Memory operand. - ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, - // Implicit read of EFLAGS - WriteADC.ReadAfterFold] in { - def ADCX32rm : ADCOXOpRM<"adcx", Xi32>, T8PD; - def ADCX64rm : ADCOXOpRM<"adcx", Xi64>, T8PD; - def ADOX32rm : ADCOXOpRM<"adox", Xi32>, T8XS; - def ADOX64rm : ADCOXOpRM<"adox", Xi64>, T8XS; - } +let OpSize = OpSizeFixed, Constraints = "$src1 = $dst", + Predicates = [HasADX] in { +def ADCX32rr : ADCOXOpRR<"adcx", Xi32>, T8PD; +def ADCX64rr : ADCOXOpRR<"adcx", Xi64>, T8PD; +def ADOX32rr : ADCOXOpRR<"adox", Xi32>, T8XS; +def ADOX64rr : ADCOXOpRR<"adox", Xi64>, T8XS; +def ADCX32rm : ADCOXOpRM<"adcx", Xi32>, T8PD; +def ADCX64rm : ADCOXOpRM<"adcx", Xi64>, T8PD; +def ADOX32rm : ADCOXOpRM<"adox", Xi32>, T8XS; +def ADOX64rm : ADCOXOpRM<"adox", Xi64>, T8XS; } From 6b505406a3403a9ab6c733ccf1fbcc52d9ca0601 Mon Sep 17 00:00:00 2001 From: madanial0 <118996571+madanial0@users.noreply.github.com> Date: Thu, 21 Dec 2023 10:22:30 -0500 Subject: [PATCH 079/342] [Flang] remove whole-archive option for AIX linker (#76039) The AIX linker does not support the `--whole-archive` option, removing the option if the OS is AIX. --------- Co-authored-by: Mark Danial <mark.danial@ibm.com> --- clang/lib/Driver/ToolChains/CommonArgs.cpp | 5 +++-- flang/test/Driver/no-duplicate-main.f90 | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 4f4bdac793bea..6eb0ed8f3fed9 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1174,8 +1174,9 @@ static void addFortranMain(const ToolChain &TC, const ArgList &Args, // The --whole-archive option needs to be part of the link line to make // sure that the main() function from Fortran_main.a is pulled in by the // linker. However, it shouldn't be used if it's already active. - // TODO: Find an equivalent of `--whole-archive` for Darwin. - if (!isWholeArchivePresent(Args) && !TC.getTriple().isMacOSX()) { + // TODO: Find an equivalent of `--whole-archive` for Darwin and AIX. 
+ if (!isWholeArchivePresent(Args) && !TC.getTriple().isMacOSX() && + !TC.getTriple().isOSAIX()) { CmdArgs.push_back("--whole-archive"); CmdArgs.push_back("-lFortran_main"); CmdArgs.push_back("--no-whole-archive"); diff --git a/flang/test/Driver/no-duplicate-main.f90 b/flang/test/Driver/no-duplicate-main.f90 index b884e7ecd7f12..88f4430828e09 100644 --- a/flang/test/Driver/no-duplicate-main.f90 +++ b/flang/test/Driver/no-duplicate-main.f90 @@ -1,4 +1,4 @@ -! UNSUPPORTED: system-windows, system-darwin +! UNSUPPORTED: system-windows, system-darwin, system-aix ! RUN: %flang -x ir -o %t.c-object -c %S/Inputs/no_duplicate_main.ll ! RUN: %flang -o %t -c %s From 35111695dd71affe100c1579858c5680091da5c1 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin <kerry.mclaughlin@arm.com> Date: Thu, 21 Dec 2023 15:25:23 +0000 Subject: [PATCH 080/342] [Clang][SME2] Enable multi-vector loads & stores for SME2 (#75821) This patch enables the following builtins for SME2: - svld1, svld1_vnum - svldnt1, svldnt1_vnum - svst1, svst1_vnum - svstnt1, svstnt1_vnum --- clang/include/clang/Basic/arm_sve.td | 124 ++++++++---------- .../acle_sve2p1_ld1.c | 99 +++++++------- .../acle_sve2p1_ldnt1.c | 99 +++++++------- .../acle_sve2p1_st1.c | 98 +++++++------- .../acle_sve2p1_stnt1.c | 98 +++++++------- 5 files changed, 267 insertions(+), 251 deletions(-) diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index dcce325188bc4..91f62c4c76339 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -1988,79 +1988,61 @@ def SVWHILELO_COUNT : SInst<"svwhilelt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNo def SVWHILELS_COUNT : SInst<"svwhilele_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilels_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; def SVWHILEHI_COUNT : SInst<"svwhilegt_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehi_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; def SVWHILEHS_COUNT : SInst<"svwhilege_{d}[_{1}]", "}nni", "QcQsQiQl", MergeNone, "aarch64_sve_whilehs_{d}", [IsOverloadNone], [ImmCheck<2, ImmCheck2_4_Mul2>]>; +} + +multiclass MultiVecLoad<string i> { + // FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available (SME2 requires __arm_streaming) + def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}]_x2", "2}c", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}]_x4", "4}c", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i 
# "_pn_x4">; + + def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}]_x2", "2}cl", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "cUc", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "iUif", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}]_x4", "4}cl", "lUld", [IsStructLoad, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; +} + +let TargetGuard = "sve2p1|sme2" in { + defm LD1 : MultiVecLoad<"ld1">; + defm LDNT1 : MultiVecLoad<"ldnt1">; +} + +multiclass MultiVecStore<string i> { + // FIXME: Replace IsStreamingCompatible with IsStreamingOrHasSVE2p1 when available (SME2 requires __arm_streaming) + def SV # NAME # B_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_X2 : MInst<"sv" # i # "[_{2}_x2]", "v}p2", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_X4 : MInst<"sv" # i # "[_{2}_x4]", "v}p4", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + + def SV # NAME # B_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # H_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # W_VNUM_X2 : MInst<"sv" # i # "_vnum" # "[_{2}_x2]", "v}pl2", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # D_VNUM_X2 : MInst<"sv" # i # "_vnum" # 
"[_{2}_x2]", "v}pl2", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x2">; + def SV # NAME # B_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "cUc", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # H_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # W_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "iUif", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; + def SV # NAME # D_VNUM_X4 : MInst<"sv" # i # "_vnum" # "[_{2}_x4]", "v}pl4", "lUld", [IsStructStore, IsStreamingCompatible], MemEltTyDefault, "aarch64_sve_" # i # "_pn_x4">; +} -def SVLD1B_X2 : MInst<"svld1[_{2}]_x2", "2}c", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; -def SVLD1H_X2 : MInst<"svld1[_{2}]_x2", "2}c", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; -def SVLD1W_X2 : MInst<"svld1[_{2}]_x2", "2}c", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; -def SVLD1D_X2 : MInst<"svld1[_{2}]_x2", "2}c", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; -def SVLD1B_X4 : MInst<"svld1[_{2}]_x4", "4}c", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; -def SVLD1H_X4 : MInst<"svld1[_{2}]_x4", "4}c", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; -def SVLD1W_X4 : MInst<"svld1[_{2}]_x4", "4}c", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; -def SVLD1D_X4 : MInst<"svld1[_{2}]_x4", "4}c", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; - -def SVLDNT1B_X2 : MInst<"svldnt1[_{2}]_x2", "2}c", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; -def SVLDNT1H_X2 : MInst<"svldnt1[_{2}]_x2", "2}c", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; -def SVLDNT1W_X2 : MInst<"svldnt1[_{2}]_x2", "2}c", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; -def SVLDNT1D_X2 : MInst<"svldnt1[_{2}]_x2", "2}c", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; -def SVLDNT1B_X4 : MInst<"svldnt1[_{2}]_x4", "4}c", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; -def SVLDNT1H_X4 : MInst<"svldnt1[_{2}]_x4", "4}c", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; -def SVLDNT1W_X4 : MInst<"svldnt1[_{2}]_x4", "4}c", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; -def SVLDNT1D_X4 : MInst<"svldnt1[_{2}]_x4", "4}c", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; - -def SVLD1B_VNUM_X2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; -def SVLD1H_VNUM_X2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; -def SVLD1W_VNUM_X2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; -def SVLD1D_VNUM_X2 : MInst<"svld1_vnum[_{2}]_x2", "2}cl", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x2">; -def SVLD1B_VNUM_X4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; -def SVLD1H_VNUM_X4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; -def SVLD1W_VNUM_X4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "iUif", 
[IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; -def SVLD1D_VNUM_X4 : MInst<"svld1_vnum[_{2}]_x4", "4}cl", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ld1_pn_x4">; - -def SVLDNT1B_VNUM_X2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; -def SVLDNT1H_VNUM_X2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; -def SVLDNT1W_VNUM_X2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; -def SVLDNT1D_VNUM_X2 : MInst<"svldnt1_vnum[_{2}]_x2", "2}cl", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x2">; -def SVLDNT1B_VNUM_X4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "cUc", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; -def SVLDNT1H_VNUM_X4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "sUshb", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; -def SVLDNT1W_VNUM_X4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "iUif", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; -def SVLDNT1D_VNUM_X4 : MInst<"svldnt1_vnum[_{2}]_x4", "4}cl", "lUld", [IsStructLoad], MemEltTyDefault, "aarch64_sve_ldnt1_pn_x4">; - -def SVST1B_X2 : MInst<"svst1[_{2}_x2]", "v}p2", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; -def SVST1H_X2 : MInst<"svst1[_{2}_x2]", "v}p2", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; -def SVST1W_X2 : MInst<"svst1[_{2}_x2]", "v}p2", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; -def SVST1D_X2 : MInst<"svst1[_{2}_x2]", "v}p2", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; -def SVST1B_X4 : MInst<"svst1[_{2}_x4]", "v}p4", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; -def SVST1H_X4 : MInst<"svst1[_{2}_x4]", "v}p4", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; -def SVST1W_X4 : MInst<"svst1[_{2}_x4]", "v}p4", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; -def SVST1D_X4 : MInst<"svst1[_{2}_x4]", "v}p4", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; - -def SVST1B_VNUM_X2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; -def SVST1H_VNUM_X2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; -def SVST1W_VNUM_X2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; -def SVST1D_VNUM_X2 : MInst<"svst1_vnum[_{2}_x2]", "v}pl2", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x2">; -def SVST1B_VNUM_X4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; -def SVST1H_VNUM_X4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; -def SVST1W_VNUM_X4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; -def SVST1D_VNUM_X4 : MInst<"svst1_vnum[_{2}_x4]", "v}pl4", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_st1_pn_x4">; - -def SVSTNT1B_X2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; -def SVSTNT1H_X2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; -def SVSTNT1W_X2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "iUif", [IsStructStore], MemEltTyDefault, 
"aarch64_sve_stnt1_pn_x2">; -def SVSTNT1D_X2 : MInst<"svstnt1[_{2}_x2]", "v}p2", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; -def SVSTNT1B_X4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; -def SVSTNT1H_X4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; -def SVSTNT1W_X4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; -def SVSTNT1D_X4 : MInst<"svstnt1[_{2}_x4]", "v}p4", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; - -def SVSTNT1B_VNUM_X2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; -def SVSTNT1H_VNUM_X2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; -def SVSTNT1W_VNUM_X2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; -def SVSTNT1D_VNUM_X2 : MInst<"svstnt1_vnum[_{2}_x2]", "v}pl2", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x2">; -def SVSTNT1B_VNUM_X4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "cUc", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; -def SVSTNT1H_VNUM_X4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "sUshb", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; -def SVSTNT1W_VNUM_X4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "iUif", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; -def SVSTNT1D_VNUM_X4 : MInst<"svstnt1_vnum[_{2}_x4]", "v}pl4", "lUld", [IsStructStore], MemEltTyDefault, "aarch64_sve_stnt1_pn_x4">; +let TargetGuard = "sve2p1|sme2" in { + defm ST1 : MultiVecStore<"st1">; + defm STNT1 : MultiVecStore<"stnt1">; +} +let TargetGuard = "sve2p1" in { def SVDOT_X2_S : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "i", MergeNone, "aarch64_sve_sdot_x2", [], []>; def SVDOT_X2_U : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "Ui", MergeNone, "aarch64_sve_udot_x2", [], []>; def SVDOT_X2_F : SInst<"svdot[_{d}_{2}_{3}]", "ddhh", "f", MergeNone, "aarch64_sve_fdot_x2", [], []>; diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c index 7a25d31de0130..6f1231e776aa3 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ld1.c @@ -1,9 +1,12 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple 
aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + #include <arm_sve.h> #ifdef SVE_OVERLOADED_FORMS @@ -13,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME2 +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svld1_u8_x2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) @@ -31,7 +40,7 @@ // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]] // -svuint8x2_t test_svld1_u8_x2(svcount_t pn, const uint8_t *base) +svuint8x2_t test_svld1_u8_x2(svcount_t pn, const uint8_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_u8,_x2,)(pn, base); } @@ -54,7 +63,7 @@ svuint8x2_t test_svld1_u8_x2(svcount_t pn, const uint8_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]] // -svuint16x2_t test_svld1_u16_x2(svcount_t pn, const uint16_t *base) +svuint16x2_t test_svld1_u16_x2(svcount_t pn, const uint16_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_u16,_x2,)(pn, base); } @@ -77,7 +86,7 @@ svuint16x2_t test_svld1_u16_x2(svcount_t pn, const uint16_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]] // -svuint32x2_t test_svld1_u32_x2(svcount_t pn, const uint32_t *base) +svuint32x2_t test_svld1_u32_x2(svcount_t pn, const uint32_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_u32,_x2,)(pn, base); } @@ -100,7 +109,7 @@ svuint32x2_t test_svld1_u32_x2(svcount_t pn, const uint32_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) // CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]] // -svuint64x2_t test_svld1_u64_x2(svcount_t pn, const uint64_t *base) +svuint64x2_t test_svld1_u64_x2(svcount_t pn, const uint64_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_u64,_x2,)(pn, base); } @@ -131,7 +140,7 @@ svuint64x2_t test_svld1_u64_x2(svcount_t pn, const uint64_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48) // CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]] // -svuint8x4_t test_svld1_u8_x4(svcount_t pn, const uint8_t *base) +svuint8x4_t test_svld1_u8_x4(svcount_t pn, const uint8_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_u8,_x4,)(pn, base); } @@ -162,7 +171,7 @@ svuint8x4_t test_svld1_u8_x4(svcount_t pn, const uint8_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]] // -svuint16x4_t test_svld1_u16_x4(svcount_t pn, const uint16_t *base) +svuint16x4_t test_svld1_u16_x4(svcount_t pn, const uint16_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_u16,_x4,)(pn, base); } @@ -193,7 +202,7 @@ svuint16x4_t test_svld1_u16_x4(svcount_t pn, const uint16_t *base) // 
CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]] // -svuint32x4_t test_svld1_u32_x4(svcount_t pn, const uint32_t *base) +svuint32x4_t test_svld1_u32_x4(svcount_t pn, const uint32_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_u32,_x4,)(pn, base); } @@ -224,7 +233,7 @@ svuint32x4_t test_svld1_u32_x4(svcount_t pn, const uint32_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]] // -svuint64x4_t test_svld1_u64_x4(svcount_t pn, const uint64_t *base) +svuint64x4_t test_svld1_u64_x4(svcount_t pn, const uint64_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_u64,_x4,)(pn, base); } @@ -247,7 +256,7 @@ svuint64x4_t test_svld1_u64_x4(svcount_t pn, const uint64_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]] // -svint8x2_t test_svld1_s8_x2(svcount_t pn, const int8_t *base) +svint8x2_t test_svld1_s8_x2(svcount_t pn, const int8_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_s8,_x2,)(pn, base); } @@ -270,7 +279,7 @@ svint8x2_t test_svld1_s8_x2(svcount_t pn, const int8_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]] // -svint16x2_t test_svld1_s16_x2(svcount_t pn, const int16_t *base) +svint16x2_t test_svld1_s16_x2(svcount_t pn, const int16_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_s16,_x2,)(pn, base); } @@ -293,7 +302,7 @@ svint16x2_t test_svld1_s16_x2(svcount_t pn, const int16_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]] // -svint32x2_t test_svld1_s32_x2(svcount_t pn, const int32_t *base) +svint32x2_t test_svld1_s32_x2(svcount_t pn, const int32_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_s32,_x2,)(pn, base); } @@ -316,7 +325,7 @@ svint32x2_t test_svld1_s32_x2(svcount_t pn, const int32_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) // CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]] // -svint64x2_t test_svld1_s64_x2(svcount_t pn, const int64_t *base) +svint64x2_t test_svld1_s64_x2(svcount_t pn, const int64_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_s64,_x2,)(pn, base); } @@ -347,7 +356,7 @@ svint64x2_t test_svld1_s64_x2(svcount_t pn, const int64_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48) // CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]] // -svint8x4_t test_svld1_s8_x4(svcount_t pn, const int8_t *base) +svint8x4_t test_svld1_s8_x4(svcount_t pn, const int8_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_s8,_x4,)(pn, base); } @@ -378,7 +387,7 @@ svint8x4_t test_svld1_s8_x4(svcount_t pn, const int8_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> 
@llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]] // -svint16x4_t test_svld1_s16_x4(svcount_t pn, const int16_t *base) +svint16x4_t test_svld1_s16_x4(svcount_t pn, const int16_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_s16,_x4,)(pn, base); } @@ -409,7 +418,7 @@ svint16x4_t test_svld1_s16_x4(svcount_t pn, const int16_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]] // -svint32x4_t test_svld1_s32_x4(svcount_t pn, const int32_t *base) +svint32x4_t test_svld1_s32_x4(svcount_t pn, const int32_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_s32,_x4,)(pn, base); } @@ -440,7 +449,7 @@ svint32x4_t test_svld1_s32_x4(svcount_t pn, const int32_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]] // -svint64x4_t test_svld1_s64_x4(svcount_t pn, const int64_t *base) +svint64x4_t test_svld1_s64_x4(svcount_t pn, const int64_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_s64,_x4,)(pn, base); } @@ -463,7 +472,7 @@ svint64x4_t test_svld1_s64_x4(svcount_t pn, const int64_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP4]] // -svfloat16x2_t test_svld1_f16_x2(svcount_t pn, const float16_t *base) +svfloat16x2_t test_svld1_f16_x2(svcount_t pn, const float16_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_f16,_x2,)(pn, base); } @@ -486,7 +495,7 @@ svfloat16x2_t test_svld1_f16_x2(svcount_t pn, const float16_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]] // -svfloat32x2_t test_svld1_f32_x2(svcount_t pn, const float32_t *base) +svfloat32x2_t test_svld1_f32_x2(svcount_t pn, const float32_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_f32,_x2,)(pn, base); } @@ -509,7 +518,7 @@ svfloat32x2_t test_svld1_f32_x2(svcount_t pn, const float32_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2) // CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP4]] // -svfloat64x2_t test_svld1_f64_x2(svcount_t pn, const float64_t *base) +svfloat64x2_t test_svld1_f64_x2(svcount_t pn, const float64_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_f64,_x2,)(pn, base); } @@ -540,7 +549,7 @@ svfloat64x2_t test_svld1_f64_x2(svcount_t pn, const float64_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP8]] // -svfloat16x4_t test_svld1_f16_x4(svcount_t pn, const float16_t *base) +svfloat16x4_t test_svld1_f16_x4(svcount_t pn, const float16_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_f16,_x4,)(pn, base); } @@ -571,7 +580,7 @@ svfloat16x4_t test_svld1_f16_x4(svcount_t pn, const float16_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> 
@llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP8]] // -svfloat32x4_t test_svld1_f32_x4(svcount_t pn, const float32_t *base) +svfloat32x4_t test_svld1_f32_x4(svcount_t pn, const float32_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_f32,_x4,)(pn, base); } @@ -602,7 +611,7 @@ svfloat32x4_t test_svld1_f32_x4(svcount_t pn, const float32_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP8]] // -svfloat64x4_t test_svld1_f64_x4(svcount_t pn, const float64_t *base) +svfloat64x4_t test_svld1_f64_x4(svcount_t pn, const float64_t *base) ATTR { return SVE_ACLE_FUNC(svld1,_f64,_x4,)(pn, base); } @@ -631,7 +640,7 @@ svfloat64x4_t test_svld1_f64_x4(svcount_t pn, const float64_t *base) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16) // CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]] // -svuint8x2_t test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) +svuint8x2_t test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_u8,_x2,)(pn, base, vnum); } @@ -656,7 +665,7 @@ svuint8x2_t test_svld1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]] // -svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) +svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_u16,_x2,)(pn, base, vnum); } @@ -681,7 +690,7 @@ svuint16x2_t test_svld1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]] // -svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum) +svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_u32,_x2,)(pn, base, vnum); } @@ -706,7 +715,7 @@ svuint32x2_t test_svld1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2) // CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP5]] // -svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) +svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_u64,_x2,)(pn, base, vnum); } @@ -739,7 +748,7 @@ svuint64x2_t test_svld1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48) // CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]] // -svuint8x4_t test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) +svuint8x4_t 
test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_u8,_x4,)(pn, base, vnum); } @@ -772,7 +781,7 @@ svuint8x4_t test_svld1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]] // -svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) +svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_u16,_x4,)(pn, base, vnum); } @@ -805,7 +814,7 @@ svuint16x4_t test_svld1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]] // -svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) +svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_u32,_x4,)(pn, base, vnum); } @@ -838,7 +847,7 @@ svuint32x4_t test_svld1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]] // -svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) +svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_u64,_x4,)(pn, base, vnum); } @@ -863,7 +872,7 @@ svuint64x4_t test_svld1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16) // CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]] // -svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) +svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_s8,_x2,)(pn, base, vnum); } @@ -888,7 +897,7 @@ svint8x2_t test_svld1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]] // -svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) +svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_s16,_x2,)(pn, base, vnum); } @@ -913,7 +922,7 @@ svint16x2_t test_svld1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vn // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]] // -svint32x2_t test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) +svint32x2_t test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_s32,_x2,)(pn, base, vnum); } @@ -938,7 +947,7 @@ svint32x2_t 
test_svld1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vn // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2) // CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP5]] // -svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) +svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_s64,_x2,)(pn, base, vnum); } @@ -971,7 +980,7 @@ svint64x2_t test_svld1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vn // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48) // CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]] // -svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) +svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_s8,_x4,)(pn, base, vnum); } @@ -1004,7 +1013,7 @@ svint8x4_t test_svld1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]] // -svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) +svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_s16,_x4,)(pn, base, vnum); } @@ -1037,7 +1046,7 @@ svint16x4_t test_svld1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vn // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]] // -svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) +svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_s32,_x4,)(pn, base, vnum); } @@ -1070,7 +1079,7 @@ svint32x4_t test_svld1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vn // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]] // -svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) +svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_s64,_x4,)(pn, base, vnum); } @@ -1095,7 +1104,7 @@ svint64x4_t test_svld1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vn // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP5]] // -svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) +svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_f16,_x2,)(pn, base, vnum); } @@ -1120,7 +1129,7 @@ svfloat16x2_t test_svld1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_ // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x float> 
@llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP5]] // -svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) +svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_f32,_x2,)(pn, base, vnum); } @@ -1145,7 +1154,7 @@ svfloat32x2_t test_svld1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_ // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2) // CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP5]] // -svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) +svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_f64,_x2,)(pn, base, vnum); } @@ -1178,7 +1187,7 @@ svfloat64x2_t test_svld1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_ // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP9]] // -svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) +svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_f16,_x4,)(pn, base, vnum); } @@ -1211,7 +1220,7 @@ svfloat16x4_t test_svld1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_ // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP9]] // -svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) +svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_f32,_x4,)(pn, base, vnum); } @@ -1244,7 +1253,7 @@ svfloat32x4_t test_svld1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_ // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP9]] // -svfloat64x4_t test_svld1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum) +svfloat64x4_t test_svld1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svld1_vnum,_f64,_x4,)(pn, base, vnum); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c index 7a0fcde819dce..3f61cc3de1395 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_ldnt1.c @@ -1,9 +1,12 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall 
-emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + #include <arm_sve.h> #ifdef SVE_OVERLOADED_FORMS @@ -13,6 +16,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME2 +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svldnt1_u8_x2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) @@ -31,7 +40,7 @@ // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]] // -svuint8x2_t test_svldnt1_u8_x2(svcount_t pn, const uint8_t *base) +svuint8x2_t test_svldnt1_u8_x2(svcount_t pn, const uint8_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_u8,_x2,)(pn, base); } @@ -54,7 +63,7 @@ svuint8x2_t test_svldnt1_u8_x2(svcount_t pn, const uint8_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]] // -svuint16x2_t test_svldnt1_u16_x2(svcount_t pn, const uint16_t *base) +svuint16x2_t test_svldnt1_u16_x2(svcount_t pn, const uint16_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_u16,_x2,)(pn, base); } @@ -77,7 +86,7 @@ svuint16x2_t test_svldnt1_u16_x2(svcount_t pn, const uint16_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]] // -svuint32x2_t test_svldnt1_u32_x2(svcount_t pn, const uint32_t *base) +svuint32x2_t test_svldnt1_u32_x2(svcount_t pn, const uint32_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_u32,_x2,)(pn, base); } @@ -100,7 +109,7 @@ svuint32x2_t test_svldnt1_u32_x2(svcount_t pn, const uint32_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) // CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]] // -svuint64x2_t test_svldnt1_u64_x2(svcount_t pn, const uint64_t *base) +svuint64x2_t test_svldnt1_u64_x2(svcount_t pn, const uint64_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_u64,_x2,)(pn, base); } @@ -131,7 +140,7 @@ svuint64x2_t test_svldnt1_u64_x2(svcount_t pn, const uint64_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48) // CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]] // -svuint8x4_t test_svldnt1_u8_x4(svcount_t pn, const uint8_t *base) +svuint8x4_t test_svldnt1_u8_x4(svcount_t pn, const uint8_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_u8,_x4,)(pn, 
base); } @@ -162,7 +171,7 @@ svuint8x4_t test_svldnt1_u8_x4(svcount_t pn, const uint8_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]] // -svuint16x4_t test_svldnt1_u16_x4(svcount_t pn, const uint16_t *base) +svuint16x4_t test_svldnt1_u16_x4(svcount_t pn, const uint16_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_u16,_x4,)(pn, base); } @@ -193,7 +202,7 @@ svuint16x4_t test_svldnt1_u16_x4(svcount_t pn, const uint16_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]] // -svuint32x4_t test_svldnt1_u32_x4(svcount_t pn, const uint32_t *base) +svuint32x4_t test_svldnt1_u32_x4(svcount_t pn, const uint32_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_u32,_x4,)(pn, base); } @@ -224,7 +233,7 @@ svuint32x4_t test_svldnt1_u32_x4(svcount_t pn, const uint32_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]] // -svuint64x4_t test_svldnt1_u64_x4(svcount_t pn, const uint64_t *base) +svuint64x4_t test_svldnt1_u64_x4(svcount_t pn, const uint64_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_u64,_x4,)(pn, base); } @@ -247,7 +256,7 @@ svuint64x4_t test_svldnt1_u64_x4(svcount_t pn, const uint64_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP4]] // -svint8x2_t test_svldnt1_s8_x2(svcount_t pn, const int8_t *base) +svint8x2_t test_svldnt1_s8_x2(svcount_t pn, const int8_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_s8,_x2,)(pn, base); } @@ -270,7 +279,7 @@ svint8x2_t test_svldnt1_s8_x2(svcount_t pn, const int8_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP4]] // -svint16x2_t test_svldnt1_s16_x2(svcount_t pn, const int16_t *base) +svint16x2_t test_svldnt1_s16_x2(svcount_t pn, const int16_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_s16,_x2,)(pn, base); } @@ -293,7 +302,7 @@ svint16x2_t test_svldnt1_s16_x2(svcount_t pn, const int16_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP4]] // -svint32x2_t test_svldnt1_s32_x2(svcount_t pn, const int32_t *base) +svint32x2_t test_svldnt1_s32_x2(svcount_t pn, const int32_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_s32,_x2,)(pn, base); } @@ -316,7 +325,7 @@ svint32x2_t test_svldnt1_s32_x2(svcount_t pn, const int32_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], i64 2) // CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP4]] // -svint64x2_t test_svldnt1_s64_x2(svcount_t pn, const int64_t *base) +svint64x2_t test_svldnt1_s64_x2(svcount_t pn, const int64_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_s64,_x2,)(pn, base); } @@ -347,7 +356,7 
@@ svint64x2_t test_svldnt1_s64_x2(svcount_t pn, const int64_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i64 48) // CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP8]] // -svint8x4_t test_svldnt1_s8_x4(svcount_t pn, const int8_t *base) +svint8x4_t test_svldnt1_s8_x4(svcount_t pn, const int8_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_s8,_x4,)(pn, base); } @@ -378,7 +387,7 @@ svint8x4_t test_svldnt1_s8_x4(svcount_t pn, const int8_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP8]] // -svint16x4_t test_svldnt1_s16_x4(svcount_t pn, const int16_t *base) +svint16x4_t test_svldnt1_s16_x4(svcount_t pn, const int16_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_s16,_x4,)(pn, base); } @@ -409,7 +418,7 @@ svint16x4_t test_svldnt1_s16_x4(svcount_t pn, const int16_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP8]] // -svint32x4_t test_svldnt1_s32_x4(svcount_t pn, const int32_t *base) +svint32x4_t test_svldnt1_s32_x4(svcount_t pn, const int32_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_s32,_x4,)(pn, base); } @@ -440,7 +449,7 @@ svint32x4_t test_svldnt1_s32_x4(svcount_t pn, const int32_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP6]], <vscale x 2 x i64> [[TMP7]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP8]] // -svint64x4_t test_svldnt1_s64_x4(svcount_t pn, const int64_t *base) +svint64x4_t test_svldnt1_s64_x4(svcount_t pn, const int64_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_s64,_x4,)(pn, base); } @@ -463,7 +472,7 @@ svint64x4_t test_svldnt1_s64_x4(svcount_t pn, const int64_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP4]] // -svfloat16x2_t test_svldnt1_f16_x2(svcount_t pn, const float16_t *base) +svfloat16x2_t test_svldnt1_f16_x2(svcount_t pn, const float16_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_f16,_x2,)(pn, base); } @@ -486,7 +495,7 @@ svfloat16x2_t test_svldnt1_f16_x2(svcount_t pn, const float16_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP4]] // -svfloat32x2_t test_svldnt1_f32_x2(svcount_t pn, const float32_t *base) +svfloat32x2_t test_svldnt1_f32_x2(svcount_t pn, const float32_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_f32,_x2,)(pn, base); } @@ -509,7 +518,7 @@ svfloat32x2_t test_svldnt1_f32_x2(svcount_t pn, const float32_t *base) // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], i64 2) // CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP4]] // -svfloat64x2_t test_svldnt1_f64_x2(svcount_t pn, const float64_t *base) +svfloat64x2_t test_svldnt1_f64_x2(svcount_t pn, const float64_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_f64,_x2,)(pn, base); } @@ 
-540,7 +549,7 @@ svfloat64x2_t test_svldnt1_f64_x2(svcount_t pn, const float64_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP8]] // -svfloat16x4_t test_svldnt1_f16_x4(svcount_t pn, const float16_t *base) +svfloat16x4_t test_svldnt1_f16_x4(svcount_t pn, const float16_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_f16,_x4,)(pn, base); } @@ -571,7 +580,7 @@ svfloat16x4_t test_svldnt1_f16_x4(svcount_t pn, const float16_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP8]] // -svfloat32x4_t test_svldnt1_f32_x4(svcount_t pn, const float32_t *base) +svfloat32x4_t test_svldnt1_f32_x4(svcount_t pn, const float32_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_f32,_x4,)(pn, base); } @@ -602,7 +611,7 @@ svfloat32x4_t test_svldnt1_f32_x4(svcount_t pn, const float32_t *base) // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP8]] // -svfloat64x4_t test_svldnt1_f64_x4(svcount_t pn, const float64_t *base) +svfloat64x4_t test_svldnt1_f64_x4(svcount_t pn, const float64_t *base) ATTR { return SVE_ACLE_FUNC(svldnt1,_f64,_x4,)(pn, base); } @@ -631,7 +640,7 @@ svfloat64x4_t test_svldnt1_f64_x4(svcount_t pn, const float64_t *base) // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16) // CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]] // -svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) +svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_u8,_x2,)(pn, base, vnum); } @@ -656,7 +665,7 @@ svuint8x2_t test_svldnt1_vnum_u8_x2(svcount_t pn, const uint8_t *base, int64_t v // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]] // -svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) +svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_u16,_x2,)(pn, base, vnum); } @@ -681,7 +690,7 @@ svuint16x2_t test_svldnt1_vnum_u16_x2(svcount_t pn, const uint16_t *base, int64_ // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]] // -svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum) +svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_u32,_x2,)(pn, base, vnum); } @@ -706,7 +715,7 @@ svuint32x2_t test_svldnt1_vnum_u32_x2(svcount_t pn, const uint32_t *base, int64_ // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2) // CPP-CHECK-NEXT: ret 
<vscale x 4 x i64> [[TMP5]] // -svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) +svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_u64,_x2,)(pn, base, vnum); } @@ -739,7 +748,7 @@ svuint64x2_t test_svldnt1_vnum_u64_x2(svcount_t pn, const uint64_t *base, int64_ // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48) // CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]] // -svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) +svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_u8,_x4,)(pn, base, vnum); } @@ -772,7 +781,7 @@ svuint8x4_t test_svldnt1_vnum_u8_x4(svcount_t pn, const uint8_t *base, int64_t v // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]] // -svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) +svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_u16,_x4,)(pn, base, vnum); } @@ -805,7 +814,7 @@ svuint16x4_t test_svldnt1_vnum_u16_x4(svcount_t pn, const uint16_t *base, int64_ // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]] // -svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) +svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_u32,_x4,)(pn, base, vnum); } @@ -838,7 +847,7 @@ svuint32x4_t test_svldnt1_vnum_u32_x4(svcount_t pn, const uint32_t *base, int64_ // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]] // -svuint64x4_t test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) +svuint64x4_t test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_u64,_x4,)(pn, base, vnum); } @@ -863,7 +872,7 @@ svuint64x4_t test_svldnt1_vnum_u64_x4(svcount_t pn, const uint64_t *base, int64_ // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i64 16) // CPP-CHECK-NEXT: ret <vscale x 32 x i8> [[TMP5]] // -svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) +svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_s8,_x2,)(pn, base, vnum); } @@ -888,7 +897,7 @@ svint8x2_t test_svldnt1_vnum_s8_x2(svcount_t pn, const int8_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x i16> [[TMP5]] // -svint16x2_t test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) +svint16x2_t 
test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_s16,_x2,)(pn, base, vnum); } @@ -913,7 +922,7 @@ svint16x2_t test_svldnt1_vnum_s16_x2(svcount_t pn, const int16_t *base, int64_t // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x i32> [[TMP5]] // -svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) +svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_s32,_x2,)(pn, base, vnum); } @@ -938,7 +947,7 @@ svint32x2_t test_svldnt1_vnum_s32_x2(svcount_t pn, const int32_t *base, int64_t // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP4]], i64 2) // CPP-CHECK-NEXT: ret <vscale x 4 x i64> [[TMP5]] // -svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) +svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_s64,_x2,)(pn, base, vnum); } @@ -971,7 +980,7 @@ svint64x2_t test_svldnt1_vnum_s64_x2(svcount_t pn, const int64_t *base, int64_t // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP8]], i64 48) // CPP-CHECK-NEXT: ret <vscale x 64 x i8> [[TMP9]] // -svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) +svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_s8,_x4,)(pn, base, vnum); } @@ -1004,7 +1013,7 @@ svint8x4_t test_svldnt1_vnum_s8_x4(svcount_t pn, const int8_t *base, int64_t vnu // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> [[TMP7]], <vscale x 8 x i16> [[TMP8]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x i16> [[TMP9]] // -svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) +svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_s16,_x4,)(pn, base, vnum); } @@ -1037,7 +1046,7 @@ svint16x4_t test_svldnt1_vnum_s16_x4(svcount_t pn, const int16_t *base, int64_t // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x i32> [[TMP9]] // -svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) +svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_s32,_x4,)(pn, base, vnum); } @@ -1070,7 +1079,7 @@ svint32x4_t test_svldnt1_vnum_s32_x4(svcount_t pn, const int32_t *base, int64_t // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP8]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x i64> [[TMP9]] // -svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) +svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_s64,_x4,)(pn, base, vnum); } @@ -1095,7 +1104,7 
@@ svint64x4_t test_svldnt1_vnum_s64_x4(svcount_t pn, const int64_t *base, int64_t // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x half> @llvm.vector.insert.nxv16f16.nxv8f16(<vscale x 16 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], i64 8) // CPP-CHECK-NEXT: ret <vscale x 16 x half> [[TMP5]] // -svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) +svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_f16,_x2,)(pn, base, vnum); } @@ -1120,7 +1129,7 @@ svfloat16x2_t test_svldnt1_vnum_f16_x2(svcount_t pn, const float16_t *base, int6 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], i64 4) // CPP-CHECK-NEXT: ret <vscale x 8 x float> [[TMP5]] // -svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) +svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_f32,_x2,)(pn, base, vnum); } @@ -1145,7 +1154,7 @@ svfloat32x2_t test_svldnt1_vnum_f32_x2(svcount_t pn, const float32_t *base, int6 // CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], i64 2) // CPP-CHECK-NEXT: ret <vscale x 4 x double> [[TMP5]] // -svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) +svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_f64,_x2,)(pn, base, vnum); } @@ -1178,7 +1187,7 @@ svfloat64x2_t test_svldnt1_vnum_f64_x2(svcount_t pn, const float64_t *base, int6 // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 32 x half> @llvm.vector.insert.nxv32f16.nxv8f16(<vscale x 32 x half> [[TMP7]], <vscale x 8 x half> [[TMP8]], i64 24) // CPP-CHECK-NEXT: ret <vscale x 32 x half> [[TMP9]] // -svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) +svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_f16,_x4,)(pn, base, vnum); } @@ -1211,7 +1220,7 @@ svfloat16x4_t test_svldnt1_vnum_f16_x4(svcount_t pn, const float16_t *base, int6 // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 16 x float> @llvm.vector.insert.nxv16f32.nxv4f32(<vscale x 16 x float> [[TMP7]], <vscale x 4 x float> [[TMP8]], i64 12) // CPP-CHECK-NEXT: ret <vscale x 16 x float> [[TMP9]] // -svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) +svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_f32,_x4,)(pn, base, vnum); } @@ -1244,7 +1253,7 @@ svfloat32x4_t test_svldnt1_vnum_f32_x4(svcount_t pn, const float32_t *base, int6 // CPP-CHECK-NEXT: [[TMP9:%.*]] = tail call <vscale x 8 x double> @llvm.vector.insert.nxv8f64.nxv2f64(<vscale x 8 x double> [[TMP7]], <vscale x 2 x double> [[TMP8]], i64 6) // CPP-CHECK-NEXT: ret <vscale x 8 x double> [[TMP9]] // -svfloat64x4_t test_svldnt1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum) +svfloat64x4_t test_svldnt1_vnum_f64_x4(svcount_t pn, const float64_t *base, int64_t vnum) ATTR { return SVE_ACLE_FUNC(svldnt1_vnum,_f64,_x4,)(pn, base, vnum); } diff --git 
a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c index 9efc37a1dd58e..7aa994345a8c3 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_st1.c @@ -1,9 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s #include <arm_sve.h> #ifdef SVE_OVERLOADED_FORMS @@ -13,6 +15,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME2 +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: @test_svst1_u8_x2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[V:%.*]], i64 0) @@ -27,7 +35,7 @@ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) +void test_svst1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) ATTR { return SVE_ACLE_FUNC(svst1,_u8_x2,,)(pn, base, v); } @@ -46,7 +54,7 @@ void test_svst1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) +void test_svst1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) ATTR { return SVE_ACLE_FUNC(svst1,_u16_x2,,)(pn, base, v); } @@ -65,7 +73,7 @@ void test_svst1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) +void test_svst1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) ATTR { return SVE_ACLE_FUNC(svst1,_u32_x2,,)(pn, base, v); } @@ -84,7 +92,7 @@ void test_svst1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], 
ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) +void test_svst1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) ATTR { return SVE_ACLE_FUNC(svst1,_u64_x2,,)(pn, base, v); } @@ -107,7 +115,7 @@ void test_svst1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) +void test_svst1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) ATTR { return SVE_ACLE_FUNC(svst1,_u8_x4,,)(pn, base, v); } @@ -130,7 +138,7 @@ void test_svst1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) +void test_svst1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) ATTR { return SVE_ACLE_FUNC(svst1,_u16_x4,,)(pn, base, v); } @@ -153,7 +161,7 @@ void test_svst1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) +void test_svst1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) ATTR { return SVE_ACLE_FUNC(svst1,_u32_x4,,)(pn, base, v); } @@ -176,7 +184,7 @@ void test_svst1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) +void test_svst1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) ATTR { return SVE_ACLE_FUNC(svst1,_u64_x4,,)(pn, base, v); } @@ -195,7 +203,7 @@ void test_svst1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) +void test_svst1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) ATTR { return SVE_ACLE_FUNC(svst1,_s8_x2,,)(pn, base, v); } @@ -214,7 +222,7 @@ void test_svst1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v) +void test_svst1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v) ATTR { return SVE_ACLE_FUNC(svst1,_s16_x2,,)(pn, base, v); } @@ -233,7 +241,7 @@ void test_svst1_s16_x2(svcount_t pn, int16_t *base, 
svint16x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) +void test_svst1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) ATTR { return SVE_ACLE_FUNC(svst1,_s32_x2,,)(pn, base, v); } @@ -252,7 +260,7 @@ void test_svst1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) +void test_svst1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) ATTR { return SVE_ACLE_FUNC(svst1,_s64_x2,,)(pn, base, v); } @@ -275,7 +283,7 @@ void test_svst1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) +void test_svst1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) ATTR { return SVE_ACLE_FUNC(svst1,_s8_x4,,)(pn, base, v); } @@ -298,7 +306,7 @@ void test_svst1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v) +void test_svst1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v) ATTR { return SVE_ACLE_FUNC(svst1,_s16_x4,,)(pn, base, v); } @@ -321,7 +329,7 @@ void test_svst1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) +void test_svst1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) ATTR { return SVE_ACLE_FUNC(svst1,_s32_x4,,)(pn, base, v); } @@ -344,7 +352,7 @@ void test_svst1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) +void test_svst1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) ATTR { return SVE_ACLE_FUNC(svst1,_s64_x4,,)(pn, base, v); } @@ -363,7 +371,7 @@ void test_svst1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v) +void test_svst1_f16_x2(svcount_t 
pn, float16_t *base, svfloat16x2_t v) ATTR { return SVE_ACLE_FUNC(svst1,_f16_x2,,)(pn, base, v); } @@ -382,7 +390,7 @@ void test_svst1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) +void test_svst1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) ATTR { return SVE_ACLE_FUNC(svst1,_f32_x2,,)(pn, base, v); } @@ -401,7 +409,7 @@ void test_svst1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) +void test_svst1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) ATTR { return SVE_ACLE_FUNC(svst1,_f64_x2,,)(pn, base, v); } @@ -424,7 +432,7 @@ void test_svst1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) +void test_svst1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) ATTR { return SVE_ACLE_FUNC(svst1,_f16_x4,,)(pn, base, v); } @@ -447,7 +455,7 @@ void test_svst1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) +void test_svst1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) ATTR { return SVE_ACLE_FUNC(svst1,_f32_x4,,)(pn, base, v); } @@ -470,7 +478,7 @@ void test_svst1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) +void test_svst1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) ATTR { return SVE_ACLE_FUNC(svst1,_f64_x4,,)(pn, base, v); } @@ -495,7 +503,7 @@ void test_svst1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) +void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_u8_x2,,)(pn, base, vnum, v); } @@ -516,7 +524,7 @@ void test_svst1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_ // CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) +void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_u16_x2,,)(pn, base, vnum, v); } @@ -537,7 +545,7 @@ void test_svst1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) +void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_u32_x2,,)(pn, base, vnum, v); } @@ -558,7 +566,7 @@ void test_svst1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) +void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_u64_x2,,)(pn, base, vnum, v); } @@ -583,7 +591,7 @@ void test_svst1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) +void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_u8_x4,,)(pn, base, vnum, v); } @@ -608,7 +616,7 @@ void test_svst1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) +void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_u16_x4,,)(pn, base, vnum, v); } @@ -633,7 +641,7 @@ void test_svst1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) +void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_u32_x4,,)(pn, base, vnum, v); } @@ -658,7 +666,7 @@ void test_svst1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32 // CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) +void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_u64_x4,,)(pn, base, vnum, v); } @@ -679,7 +687,7 @@ void test_svst1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) +void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_s8_x2,,)(pn, base, vnum, v); } @@ -700,7 +708,7 @@ void test_svst1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) +void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_s16_x2,,)(pn, base, vnum, v); } @@ -721,7 +729,7 @@ void test_svst1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) +void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_s32_x2,,)(pn, base, vnum, v); } @@ -742,7 +750,7 @@ void test_svst1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) +void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_s64_x2,,)(pn, base, vnum, v); } @@ -767,7 +775,7 @@ void test_svst1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) +void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_s8_x4,,)(pn, base, vnum, v); } @@ -792,7 +800,7 @@ void test_svst1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale 
x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) +void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_s16_x4,,)(pn, base, vnum, v); } @@ -817,7 +825,7 @@ void test_svst1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) +void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_s32_x4,,)(pn, base, vnum, v); } @@ -842,7 +850,7 @@ void test_svst1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) +void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_s64_x4,,)(pn, base, vnum, v); } @@ -865,7 +873,7 @@ void test_svst1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) +void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_f16_x2,,)(pn, base, vnum, v); } @@ -888,7 +896,7 @@ void test_svst1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svflo // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) +void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_f32_x2,,)(pn, base, vnum, v); } @@ -911,7 +919,7 @@ void test_svst1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svflo // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) +void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_f64_x2,,)(pn, base, vnum, v); } @@ -938,7 +946,7 @@ void test_svst1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svflo // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> 
[[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) +void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_f16_x4,,)(pn, base, vnum, v); } @@ -965,7 +973,7 @@ void test_svst1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svflo // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v) +void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_f32_x4,,)(pn, base, vnum, v); } @@ -992,7 +1000,7 @@ void test_svst1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svflo // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svst1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v) +void test_svst1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v) ATTR { return SVE_ACLE_FUNC(svst1_vnum,_f64_x4,,)(pn, base, vnum, v); } diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c index 9b860fe7180e1..0d8696a7634a7 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_stnt1.c @@ -1,9 +1,12 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + #include <arm_sve.h> #ifdef SVE_OVERLOADED_FORMS @@ -13,6 +16,11 @@ #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4 #endif +#ifndef TEST_SME2 +#define ATTR +#else +#define ATTR __arm_streaming +#endif // CHECK-LABEL: @test_svstnt1_u8_x2( // CHECK-NEXT: entry: @@ -28,7 +36,7 @@ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> 
[[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) +void test_svstnt1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_u8_x2,,)(pn, base, v); } @@ -48,7 +56,7 @@ void test_svstnt1_u8_x2(svcount_t pn, uint8_t *base, svuint8x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) +void test_svstnt1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_u16_x2,,)(pn, base, v); } @@ -68,7 +76,7 @@ void test_svstnt1_u16_x2(svcount_t pn, uint16_t *base, svuint16x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) +void test_svstnt1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_u32_x2,,)(pn, base, v); } @@ -88,7 +96,7 @@ void test_svstnt1_u32_x2(svcount_t pn, uint32_t *base, svuint32x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) +void test_svstnt1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_u64_x2,,)(pn, base, v); } @@ -112,7 +120,7 @@ void test_svstnt1_u64_x2(svcount_t pn, uint64_t *base, svuint64x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) +void test_svstnt1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_u8_x4,,)(pn, base, v); } @@ -136,7 +144,7 @@ void test_svstnt1_u8_x4(svcount_t pn, uint8_t *base, svuint8x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) +void test_svstnt1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_u16_x4,,)(pn, base, v); } @@ -160,7 +168,7 @@ void test_svstnt1_u16_x4(svcount_t pn, uint16_t *base, svuint16x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) +void test_svstnt1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) ATTR { return 
SVE_ACLE_FUNC(svstnt1,_u32_x4,,)(pn, base, v); } @@ -184,7 +192,7 @@ void test_svstnt1_u32_x4(svcount_t pn, uint32_t *base, svuint32x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) +void test_svstnt1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_u64_x4,,)(pn, base, v); } @@ -204,7 +212,7 @@ void test_svstnt1_u64_x4(svcount_t pn, uint64_t *base, svuint64x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) +void test_svstnt1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_s8_x2,,)(pn, base, v); } @@ -224,7 +232,7 @@ void test_svstnt1_s8_x2(svcount_t pn, int8_t *base, svint8x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v) +void test_svstnt1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_s16_x2,,)(pn, base, v); } @@ -244,7 +252,7 @@ void test_svstnt1_s16_x2(svcount_t pn, int16_t *base, svint16x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) +void test_svstnt1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_s32_x2,,)(pn, base, v); } @@ -264,7 +272,7 @@ void test_svstnt1_s32_x2(svcount_t pn, int32_t *base, svint32x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) +void test_svstnt1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_s64_x2,,)(pn, base, v); } @@ -288,7 +296,7 @@ void test_svstnt1_s64_x2(svcount_t pn, int64_t *base, svint64x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) +void test_svstnt1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_s8_x4,,)(pn, base, v); } @@ -312,7 +320,7 @@ void test_svstnt1_s8_x4(svcount_t pn, int8_t *base, svint8x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // 
CPP-CHECK-NEXT: ret void // -void test_svstnt1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v) +void test_svstnt1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_s16_x4,,)(pn, base, v); } @@ -336,7 +344,7 @@ void test_svstnt1_s16_x4(svcount_t pn, int16_t *base, svint16x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) +void test_svstnt1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_s32_x4,,)(pn, base, v); } @@ -360,7 +368,7 @@ void test_svstnt1_s32_x4(svcount_t pn, int32_t *base, svint32x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) +void test_svstnt1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_s64_x4,,)(pn, base, v); } @@ -380,7 +388,7 @@ void test_svstnt1_s64_x4(svcount_t pn, int64_t *base, svint64x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v) +void test_svstnt1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_f16_x2,,)(pn, base, v); } @@ -400,7 +408,7 @@ void test_svstnt1_f16_x2(svcount_t pn, float16_t *base, svfloat16x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) +void test_svstnt1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_f32_x2,,)(pn, base, v); } @@ -420,7 +428,7 @@ void test_svstnt1_f32_x2(svcount_t pn, float32_t *base, svfloat32x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) +void test_svstnt1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_f64_x2,,)(pn, base, v); } @@ -444,7 +452,7 @@ void test_svstnt1_f64_x2(svcount_t pn, float64_t *base, svfloat64x2_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) +void test_svstnt1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_f16_x4,,)(pn, base, v); } @@ -468,7 +476,7 @@ void 
test_svstnt1_f16_x4(svcount_t pn, float16_t *base, svfloat16x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) +void test_svstnt1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_f32_x4,,)(pn, base, v); } @@ -492,7 +500,7 @@ void test_svstnt1_f32_x4(svcount_t pn, float32_t *base, svfloat32x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) +void test_svstnt1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1,_f64_x4,,)(pn, base, v); } @@ -518,7 +526,7 @@ void test_svstnt1_f64_x4(svcount_t pn, float64_t *base, svfloat64x4_t v) // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) +void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_u8_x2,,)(pn, base, vnum, v); } @@ -540,7 +548,7 @@ void test_svstnt1_vnum_u8_x2(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) +void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_u16_x2,,)(pn, base, vnum, v); } @@ -562,7 +570,7 @@ void test_svstnt1_vnum_u16_x2(svcount_t pn, uint16_t *base, int64_t vnum, svuint // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) +void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_u32_x2,,)(pn, base, vnum, v); } @@ -584,7 +592,7 @@ void test_svstnt1_vnum_u32_x2(svcount_t pn, uint32_t *base, int64_t vnum, svuint // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) +void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_u64_x2,,)(pn, base, vnum, v); } @@ -610,7 +618,7 @@ void test_svstnt1_vnum_u64_x2(svcount_t pn, uint64_t *base, int64_t vnum, svuint // CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) +void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_u8_x4,,)(pn, base, vnum, v); } @@ -636,7 +644,7 @@ void test_svstnt1_vnum_u8_x4(svcount_t pn, uint8_t *base, int64_t vnum, svuint8x // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) +void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint16x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_u16_x4,,)(pn, base, vnum, v); } @@ -662,7 +670,7 @@ void test_svstnt1_vnum_u16_x4(svcount_t pn, uint16_t *base, int64_t vnum, svuint // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) +void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint32x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_u32_x4,,)(pn, base, vnum, v); } @@ -688,7 +696,7 @@ void test_svstnt1_vnum_u32_x4(svcount_t pn, uint32_t *base, int64_t vnum, svuint // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) +void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint64x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_u64_x4,,)(pn, base, vnum, v); } @@ -710,7 +718,7 @@ void test_svstnt1_vnum_u64_x4(svcount_t pn, uint64_t *base, int64_t vnum, svuint // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) +void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_s8_x2,,)(pn, base, vnum, v); } @@ -732,7 +740,7 @@ void test_svstnt1_vnum_s8_x2(svcount_t pn, int8_t *base, int64_t vnum, svint8x2_ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) +void test_svstnt1_vnum_s16_x2(svcount_t pn, int16_t *base, int64_t vnum, svint16x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_s16_x2,,)(pn, base, vnum, v); } @@ -754,7 +762,7 @@ void test_svstnt1_vnum_s16_x2(svcount_t 
pn, int16_t *base, int64_t vnum, svint16 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) +void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_s32_x2,,)(pn, base, vnum, v); } @@ -776,7 +784,7 @@ void test_svstnt1_vnum_s32_x2(svcount_t pn, int32_t *base, int64_t vnum, svint32 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) +void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_s64_x2,,)(pn, base, vnum, v); } @@ -802,7 +810,7 @@ void test_svstnt1_vnum_s64_x2(svcount_t pn, int64_t *base, int64_t vnum, svint64 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) +void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_s8_x4,,)(pn, base, vnum, v); } @@ -828,7 +836,7 @@ void test_svstnt1_vnum_s8_x4(svcount_t pn, int8_t *base, int64_t vnum, svint8x4_ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) +void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_s16_x4,,)(pn, base, vnum, v); } @@ -854,7 +862,7 @@ void test_svstnt1_vnum_s16_x4(svcount_t pn, int16_t *base, int64_t vnum, svint16 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> [[TMP0]], <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) +void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_s32_x4,,)(pn, base, vnum, v); } @@ -880,7 +888,7 @@ void test_svstnt1_vnum_s32_x4(svcount_t pn, int32_t *base, int64_t vnum, svint32 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> [[TMP0]], <vscale x 2 x i64> [[TMP1]], <vscale x 2 x i64> [[TMP2]], <vscale x 2 x i64> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) +void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_s64_x4,,)(pn, base, vnum, 
v); } @@ -904,7 +912,7 @@ void test_svstnt1_vnum_s64_x4(svcount_t pn, int64_t *base, int64_t vnum, svint64 // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) +void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_f16_x2,,)(pn, base, vnum, v); } @@ -928,7 +936,7 @@ void test_svstnt1_vnum_f16_x2(svcount_t pn, float16_t *base, float64_t vnum, svf // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) +void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_f32_x2,,)(pn, base, vnum, v); } @@ -952,7 +960,7 @@ void test_svstnt1_vnum_f32_x2(svcount_t pn, float32_t *base, float64_t vnum, svf // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP2]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) +void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x2_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_f64_x2,,)(pn, base, vnum, v); } @@ -980,7 +988,7 @@ void test_svstnt1_vnum_f64_x2(svcount_t pn, float64_t *base, float64_t vnum, svf // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) +void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svfloat16x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_f16_x4,,)(pn, base, vnum, v); } @@ -1008,7 +1016,7 @@ void test_svstnt1_vnum_f16_x4(svcount_t pn, float16_t *base, float64_t vnum, svf // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v) +void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svfloat32x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_f32_x4,,)(pn, base, vnum, v); } @@ -1036,7 +1044,7 @@ void test_svstnt1_vnum_f32_x4(svcount_t pn, float32_t *base, float64_t vnum, svf // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], target("aarch64.svcount") [[PN:%.*]], ptr [[TMP4]]) // CPP-CHECK-NEXT: ret void // -void test_svstnt1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v) +void 
test_svstnt1_vnum_f64_x4(svcount_t pn, float64_t *base, float64_t vnum, svfloat64x4_t v) ATTR { return SVE_ACLE_FUNC(svstnt1_vnum,_f64_x4,,)(pn, base, vnum, v); } From 8fdfd34cd2ad67cd3fe2ded59b476790240a52bb Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.foad@amd.com> Date: Thu, 21 Dec 2023 15:27:08 +0000 Subject: [PATCH 081/342] [AMDGPU] Remove GDS and GWS for GFX12 (#76148) --- llvm/lib/Target/AMDGPU/AMDGPU.td | 4 ++-- llvm/lib/Target/AMDGPU/DSInstructions.td | 12 ++++++++---- .../AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 5 +++++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 ++++++++ llvm/test/CodeGen/AMDGPU/gds-unsupported.ll | 1 + .../AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll | 3 +++ llvm/test/CodeGen/AMDGPU/verify-gfx12-gds.mir | 10 ++++++++++ llvm/test/MC/Disassembler/AMDGPU/decode-err.txt | 8 ++++++++ llvm/test/MC/Disassembler/AMDGPU/gfx90a_features.txt | 5 ----- 9 files changed, 45 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/verify-gfx12-gds.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 060fb66d38f7b..d2a325d5ad898 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1100,8 +1100,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, - FeatureGWS, FeatureTrue16BitInsts + FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, + FeatureTrue16BitInsts ] >; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 3a895923fa4b9..bc9049b4ef33c 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1147,7 +1147,8 @@ def : GCNPat < >; } // End SubtargetPredicate = HasAtomicDsPkAdd16Insts -def : Pat < +let OtherPredicates = [HasGDS] in +def : GCNPat < (SIds_ordered_count i32:$value, i16:$offset), (DS_ORDERED_COUNT $value, (as_i16imm $offset)) >; @@ -1189,7 +1190,8 @@ def : GCNPat < //===----------------------------------------------------------------------===// class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef, - string opName = ps.Mnemonic> + string opName = ps.Mnemonic, + bit hasGFX12Enc = 0> : DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> { let Inst{7-0} = !if(ps.has_offset0, offset0, 0); @@ -1201,6 +1203,8 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef, let Inst{47-40} = !if(ps.has_data0, data0{7-0}, 0); let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0); let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0); + + let gds = !if(hasGFX12Enc, 0, ?); } //===----------------------------------------------------------------------===// @@ -1212,7 +1216,7 @@ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in { defvar ps = !cast<DS_Pseudo>(NAME); def _gfx12 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX12, - ps.Mnemonic>; + ps.Mnemonic, 1>; } multiclass DS_Real_Renamed_gfx12<bits<8> op, DS_Pseudo backing_pseudo, @@ -1220,7 +1224,7 @@ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in { def _gfx12 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, backing_pseudo, SIEncodingFamily.GFX12, - real_name>, + real_name, 1>, MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX12Plus]>; } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp 
b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index ed2e7e4f189e0..7939d0036568d 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -702,6 +702,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, AMDGPU::OpName::src2_modifiers); } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) && + !AMDGPU::hasGDS(STI)) { + insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds); + } + if (Res && (MCII->get(MI.getOpcode()).TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) { int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(), diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index e599f23101c81..29ac08b6895e5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4983,6 +4983,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } + if (isDS(MI) && !ST.hasGDS()) { + const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds); + if (GDSOp && GDSOp->getImm() != 0) { + ErrInfo = "GDS is not supported on this subtarget"; + return false; + } + } + if (isImage(MI)) { const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim); if (DimOp) { diff --git a/llvm/test/CodeGen/AMDGPU/gds-unsupported.ll b/llvm/test/CodeGen/AMDGPU/gds-unsupported.ll index 174de0d58295a..d35b9f79e23ab 100644 --- a/llvm/test/CodeGen/AMDGPU/gds-unsupported.ll +++ b/llvm/test/CodeGen/AMDGPU/gds-unsupported.ll @@ -1,4 +1,5 @@ ; RUN: not --crash llc -march=amdgcn -mcpu=gfx90a < %s 2>&1 | FileCheck %s +; RUN: not --crash llc -march=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck %s ; GDS is not supported on GFX90A+ ; CHECK: LLVM ERROR: Cannot select: {{.*}} AtomicLoadAdd diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll index a9c2c27903898..3e977c054ec2e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll @@ -1,5 +1,8 @@ ; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s +; RUN: not --crash llc -march=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s + +; GFX12-ERR: LLVM ERROR: Cannot select: {{.*}} = DS_ORDERED_COUNT ; FUNC-LABEL: {{^}}ds_ordered_add: ; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31 diff --git a/llvm/test/CodeGen/AMDGPU/verify-gfx12-gds.mir b/llvm/test/CodeGen/AMDGPU/verify-gfx12-gds.mir new file mode 100644 index 0000000000000..a2182aa8d6efe --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/verify-gfx12-gds.mir @@ -0,0 +1,10 @@ +# RUN: not --crash llc -march=amdgcn -mcpu=gfx1200 -run-pass=none -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX12 %s + +--- +name: gds +body: | + bb.0: + ; GFX12: *** Bad machine code: GDS is not supported on this subtarget *** + ; GFX12: - instruction: DS_ADD_U32 %0:vgpr_32, %1:vgpr_32, 0, 1, implicit $m0, implicit $exec :: (load store acq_rel (s32), addrspace 2) + DS_ADD_U32 %0:vgpr_32, %2:vgpr_32, 0, 1, implicit $m0, implicit $exec :: (load store acq_rel (s32), addrspace 2) +... 
diff --git a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt index 24716b1226e49..e1bb7ad511715 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt @@ -34,6 +34,14 @@ # W64: v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x10,0x40,0x40,0xcc,0x00,0x11,0x02,0x18] 0x10,0x40,0x40,0xcc,0x00,0x11,0x02,0x18 # src2 sgpr0 +# this is ds_add_f32 with gds bit which is not valid on gfx12+ +# GFX12: [[@LINE+1]]:1: warning: invalid instruction encoding +0x00,0x00,0x56,0xd8,0x00,0x01,0x00,0x00 + # this is image_msaa_load where samp field for gfx12 VSAMPLE is not all zeros # GFX12: [[@LINE+1]]:1: warning: invalid instruction encoding 0x06,0x00,0x46,0xe4,0x01,0x10,0x80,0x00,0x05,0x06,0x07,0x00 + +# This is ds_read_b32 with gds bit which is not valid on gfx90a. +# GFX90A: [[@LINE+1]]:1: warning: invalid instruction encoding +0x00,0x00,0x6d,0xd8,0x01,0x00,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_features.txt index 8746ee79c8f55..b348e8b5ef013 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_features.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_features.txt @@ -773,8 +773,3 @@ # GFX90A: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00] 0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00 - -# Disassembler still decodes the gds modifier even though the assembler does -# not accept it. -# GFX90A: ds_read_b32 v0, v1 gds ; encoding: [0x00,0x00,0x6d,0xd8,0x01,0x00,0x00,0x00] -0x00,0x00,0x6d,0xd8,0x01,0x00,0x00,0x00 From 4d7112435e31dafb5854f69c516373e4548bd0a3 Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Thu, 21 Dec 2023 16:31:56 +0100 Subject: [PATCH 082/342] [InstCombine] Add zext nneg test variant for gep of sext add fold (NFC) --- llvm/test/Transforms/InstCombine/array.ll | 36 +++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/array.ll b/llvm/test/Transforms/InstCombine/array.ll index 8bab3104fd8cd..824cf3d663c82 100644 --- a/llvm/test/Transforms/InstCombine/array.ll +++ b/llvm/test/Transforms/InstCombine/array.ll @@ -72,3 +72,39 @@ entry: store i32 %b, ptr %gep ret void } + +define void @test_zext_nneg(ptr %ptr, i32 %a, i32 %b) { +; CHECK-LABEL: define void @test_zext_nneg( +; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[A]], 10 +; CHECK-NEXT: [[IDX:%.*]] = zext nneg i32 [[ADD]] to i64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IDX]] +; CHECK-NEXT: store i32 [[B]], ptr [[GEP]], align 4 +; CHECK-NEXT: ret void +; +entry: + %add = add nsw i32 %a, 10 + %idx = zext nneg i32 %add to i64 + %gep = getelementptr inbounds i32, ptr %ptr, i64 %idx + store i32 %b, ptr %gep + ret void +} + +define void @test_zext_missing_nneg(ptr %ptr, i32 %a, i32 %b) { +; CHECK-LABEL: define void @test_zext_missing_nneg( +; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[A]], 10 +; CHECK-NEXT: [[IDX:%.*]] = zext i32 [[ADD]] to i64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IDX]] +; CHECK-NEXT: store i32 [[B]], ptr [[GEP]], align 4 +; CHECK-NEXT: ret void +; +entry: + %add = add nsw i32 %a, 10 + %idx = zext i32 %add to i64 + %gep 
= getelementptr inbounds i32, ptr %ptr, i64 %idx + store i32 %b, ptr %gep + ret void +} From b8df88b41c8a1b4e879b4fd34be3522c9b45e86f Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Thu, 21 Dec 2023 16:27:11 +0100 Subject: [PATCH 083/342] [InstCombine] Support zext nneg in gep of sext add fold Add m_NNegZext() and m_SExtLike() matchers to make doing these kinds of changes simpler in the future. --- llvm/include/llvm/IR/PatternMatch.h | 26 +++++++++++++++++++ .../InstCombine/InstructionCombining.cpp | 2 +- llvm/test/Transforms/InstCombine/array.ll | 6 ++--- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index 096d1688af3f7..48afdb867ba6c 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -1656,6 +1656,19 @@ template <typename Op_t> struct PtrToIntSameSize_match { } }; +template <typename Op_t> struct NNegZExt_match { + Op_t Op; + + NNegZExt_match(const Op_t &OpMatch) : Op(OpMatch) {} + + template <typename OpTy> bool match(OpTy *V) { + if (auto *I = dyn_cast<Instruction>(V)) + return I->getOpcode() == Instruction::ZExt && I->hasNonNeg() && + Op.match(I->getOperand(0)); + return false; + } +}; + /// Matches BitCast. template <typename OpTy> inline CastOperator_match<OpTy, Instruction::BitCast> @@ -1707,6 +1720,11 @@ inline CastInst_match<OpTy, Instruction::ZExt> m_ZExt(const OpTy &Op) { return CastInst_match<OpTy, Instruction::ZExt>(Op); } +template <typename OpTy> +inline NNegZExt_match<OpTy> m_NNegZExt(const OpTy &Op) { + return NNegZExt_match<OpTy>(Op); +} + template <typename OpTy> inline match_combine_or<CastInst_match<OpTy, Instruction::ZExt>, OpTy> m_ZExtOrSelf(const OpTy &Op) { @@ -1719,6 +1737,14 @@ m_SExtOrSelf(const OpTy &Op) { return m_CombineOr(m_SExt(Op), Op); } +/// Match either "sext" or "zext nneg". 
+template <typename OpTy> +inline match_combine_or<CastInst_match<OpTy, Instruction::SExt>, + NNegZExt_match<OpTy>> +m_SExtLike(const OpTy &Op) { + return m_CombineOr(m_SExt(Op), m_NNegZExt(Op)); +} + template <typename OpTy> inline match_combine_or<CastInst_match<OpTy, Instruction::ZExt>, CastInst_match<OpTy, Instruction::SExt>> diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 775720ab43a5c..7f5a7b666903d 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2516,7 +2516,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { Idx2); } ConstantInt *C; - if (match(GEP.getOperand(1), m_OneUse(m_SExt(m_OneUse(m_NSWAdd( + if (match(GEP.getOperand(1), m_OneUse(m_SExtLike(m_OneUse(m_NSWAdd( m_Value(Idx1), m_ConstantInt(C))))))) { // %add = add nsw i32 %idx1, idx2 // %sidx = sext i32 %add to i64 diff --git a/llvm/test/Transforms/InstCombine/array.ll b/llvm/test/Transforms/InstCombine/array.ll index 824cf3d663c82..396a7aa340f6b 100644 --- a/llvm/test/Transforms/InstCombine/array.ll +++ b/llvm/test/Transforms/InstCombine/array.ll @@ -77,9 +77,9 @@ define void @test_zext_nneg(ptr %ptr, i32 %a, i32 %b) { ; CHECK-LABEL: define void @test_zext_nneg( ; CHECK-SAME: ptr [[PTR:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[A]], 10 -; CHECK-NEXT: [[IDX:%.*]] = zext nneg i32 [[ADD]] to i64 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[IDX]] +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[A]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP0]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 10 ; CHECK-NEXT: store i32 [[B]], ptr [[GEP]], align 4 ; CHECK-NEXT: ret void ; From a134abf4be132cfff2fc5132d6226db919c0865b Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Thu, 21 Dec 2023 16:49:37 +0100 Subject: [PATCH 084/342] [ValueTracking] Make isGuaranteedNotToBeUndef() more precise (#76160) Currently isGuaranteedNotToBeUndef() is the same as isGuaranteedNotToBeUndefOrPoison(). This function is used in places where we only care about undef (due to multi-use issues), not poison. Make it more precise by only considering instructions that can create undef (like loads or call), and ignore those that can only create poison. In particular, we can ignore poison-generating flags. This means that inferring more flags has less chance to pessimize other transforms. 
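A minimal IR sketch of the distinction this relies on (illustrative only, not part of the
patch; the function name is made up): with a noundef operand, an 'add nuw' can introduce
poison on overflow but can never produce undef, so the undef-only query can succeed where
the combined undef-or-poison query cannot.

  define i16 @undef_vs_poison(i16 noundef %a) {
    ; %x may be poison when the add overflows (nuw), but it is never undef:
    ; %a is noundef and add does not create undef on its own. With this
    ; change isGuaranteedNotToBeUndef(%x) can return true, while
    ; isGuaranteedNotToBeUndefOrPoison(%x) still returns false because of
    ; the poison-generating nuw flag.
    %x = add nuw i16 %a, 1
    ret i16 %x
  }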
--- llvm/lib/Analysis/ValueTracking.cpp | 82 ++++++++++++------- .../CorrelatedValuePropagation/cond-at-use.ll | 3 +- 2 files changed, 52 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 2ce660b9a858e..769d921eb1e8d 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6560,10 +6560,25 @@ static bool shiftAmountKnownInRange(const Value *ShiftAmount) { return Safe; } -static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly, +enum class UndefPoisonKind { + PoisonOnly = (1 << 0), + UndefOnly = (1 << 1), + UndefOrPoison = PoisonOnly | UndefOnly, +}; + +static bool includesPoison(UndefPoisonKind Kind) { + return (unsigned(Kind) & unsigned(UndefPoisonKind::PoisonOnly)) != 0; +} + +static bool includesUndef(UndefPoisonKind Kind) { + return (unsigned(Kind) & unsigned(UndefPoisonKind::UndefOnly)) != 0; +} + +static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, bool ConsiderFlagsAndMetadata) { - if (ConsiderFlagsAndMetadata && Op->hasPoisonGeneratingFlagsOrMetadata()) + if (ConsiderFlagsAndMetadata && includesPoison(Kind) && + Op->hasPoisonGeneratingFlagsOrMetadata()) return true; unsigned Opcode = Op->getOpcode(); @@ -6573,7 +6588,7 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly, case Instruction::Shl: case Instruction::AShr: case Instruction::LShr: - return !shiftAmountKnownInRange(Op->getOperand(1)); + return includesPoison(Kind) && !shiftAmountKnownInRange(Op->getOperand(1)); case Instruction::FPToSI: case Instruction::FPToUI: // fptosi/ui yields poison if the resulting value does not fit in the @@ -6614,7 +6629,8 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly, return false; case Intrinsic::sshl_sat: case Intrinsic::ushl_sat: - return !shiftAmountKnownInRange(II->getArgOperand(1)); + return includesPoison(Kind) && + !shiftAmountKnownInRange(II->getArgOperand(1)); case Intrinsic::fma: case Intrinsic::fmuladd: case Intrinsic::sqrt: @@ -6669,15 +6685,16 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly, auto *VTy = cast<VectorType>(Op->getOperand(0)->getType()); unsigned IdxOp = Op->getOpcode() == Instruction::InsertElement ? 2 : 1; auto *Idx = dyn_cast<ConstantInt>(Op->getOperand(IdxOp)); - if (!Idx || Idx->getValue().uge(VTy->getElementCount().getKnownMinValue())) - return true; + if (includesPoison(Kind)) + return !Idx || + Idx->getValue().uge(VTy->getElementCount().getKnownMinValue()); return false; } case Instruction::ShuffleVector: { ArrayRef<int> Mask = isa<ConstantExpr>(Op) ? 
cast<ConstantExpr>(Op)->getShuffleMask() : cast<ShuffleVectorInst>(Op)->getShuffleMask(); - return is_contained(Mask, PoisonMaskElem); + return includesPoison(Kind) && is_contained(Mask, PoisonMaskElem); } case Instruction::FNeg: case Instruction::PHI: @@ -6713,17 +6730,17 @@ static bool canCreateUndefOrPoison(const Operator *Op, bool PoisonOnly, bool llvm::canCreateUndefOrPoison(const Operator *Op, bool ConsiderFlagsAndMetadata) { - return ::canCreateUndefOrPoison(Op, /*PoisonOnly=*/false, + return ::canCreateUndefOrPoison(Op, UndefPoisonKind::UndefOrPoison, ConsiderFlagsAndMetadata); } bool llvm::canCreatePoison(const Operator *Op, bool ConsiderFlagsAndMetadata) { - return ::canCreateUndefOrPoison(Op, /*PoisonOnly=*/true, + return ::canCreateUndefOrPoison(Op, UndefPoisonKind::PoisonOnly, ConsiderFlagsAndMetadata); } -static bool directlyImpliesPoison(const Value *ValAssumedPoison, - const Value *V, unsigned Depth) { +static bool directlyImpliesPoison(const Value *ValAssumedPoison, const Value *V, + unsigned Depth) { if (ValAssumedPoison == V) return true; @@ -6775,14 +6792,11 @@ bool llvm::impliesPoison(const Value *ValAssumedPoison, const Value *V) { return ::impliesPoison(ValAssumedPoison, V, /* Depth */ 0); } -static bool programUndefinedIfUndefOrPoison(const Value *V, - bool PoisonOnly); +static bool programUndefinedIfUndefOrPoison(const Value *V, bool PoisonOnly); -static bool isGuaranteedNotToBeUndefOrPoison(const Value *V, - AssumptionCache *AC, - const Instruction *CtxI, - const DominatorTree *DT, - unsigned Depth, bool PoisonOnly) { +static bool isGuaranteedNotToBeUndefOrPoison( + const Value *V, AssumptionCache *AC, const Instruction *CtxI, + const DominatorTree *DT, unsigned Depth, UndefPoisonKind Kind) { if (Depth >= MaxAnalysisRecursionDepth) return false; @@ -6797,16 +6811,19 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V, } if (auto *C = dyn_cast<Constant>(V)) { + if (isa<PoisonValue>(C)) + return !includesPoison(Kind); + if (isa<UndefValue>(C)) - return PoisonOnly && !isa<PoisonValue>(C); + return !includesUndef(Kind); if (isa<ConstantInt>(C) || isa<GlobalVariable>(C) || isa<ConstantFP>(V) || isa<ConstantPointerNull>(C) || isa<Function>(C)) return true; if (C->getType()->isVectorTy() && !isa<ConstantExpr>(C)) - return (PoisonOnly ? !C->containsPoisonElement() - : !C->containsUndefOrPoisonElement()) && + return (!includesUndef(Kind) ? 
!C->containsPoisonElement() + : !C->containsUndefOrPoisonElement()) && !C->containsConstantExpression(); } @@ -6824,8 +6841,7 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V, return true; auto OpCheck = [&](const Value *V) { - return isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth + 1, - PoisonOnly); + return isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth + 1, Kind); }; if (auto *Opr = dyn_cast<Operator>(V)) { @@ -6847,14 +6863,16 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V, for (unsigned i = 0; i < Num; ++i) { auto *TI = PN->getIncomingBlock(i)->getTerminator(); if (!isGuaranteedNotToBeUndefOrPoison(PN->getIncomingValue(i), AC, TI, - DT, Depth + 1, PoisonOnly)) { + DT, Depth + 1, Kind)) { IsWellDefined = false; break; } } if (IsWellDefined) return true; - } else if (!canCreateUndefOrPoison(Opr) && all_of(Opr->operands(), OpCheck)) + } else if (!::canCreateUndefOrPoison(Opr, Kind, + /*ConsiderFlagsAndMetadata*/ true) && + all_of(Opr->operands(), OpCheck)) return true; } @@ -6864,7 +6882,7 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V, I->hasMetadata(LLVMContext::MD_dereferenceable_or_null)) return true; - if (programUndefinedIfUndefOrPoison(V, PoisonOnly)) + if (programUndefinedIfUndefOrPoison(V, !includesUndef(Kind))) return true; // CxtI may be null or a cloned instruction. @@ -6896,7 +6914,7 @@ static bool isGuaranteedNotToBeUndefOrPoison(const Value *V, if (Cond) { if (Cond == V) return true; - else if (PoisonOnly && isa<Operator>(Cond)) { + else if (!includesUndef(Kind) && isa<Operator>(Cond)) { // For poison, we can analyze further auto *Opr = cast<Operator>(Cond); if (any_of(Opr->operands(), @@ -6918,20 +6936,22 @@ bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, AssumptionCache *AC, const Instruction *CtxI, const DominatorTree *DT, unsigned Depth) { - return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth, false); + return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth, + UndefPoisonKind::UndefOrPoison); } bool llvm::isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC, const Instruction *CtxI, const DominatorTree *DT, unsigned Depth) { - return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth, true); + return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth, + UndefPoisonKind::PoisonOnly); } bool llvm::isGuaranteedNotToBeUndef(const Value *V, AssumptionCache *AC, const Instruction *CtxI, const DominatorTree *DT, unsigned Depth) { - // TODO: This is currently equivalent to isGuaranteedNotToBeUndefOrPoison(). 
- return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth, false); + return ::isGuaranteedNotToBeUndefOrPoison(V, AC, CtxI, DT, Depth, + UndefPoisonKind::UndefOnly); } /// Return true if undefined behavior would provably be executed on the path to diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll b/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll index 546baf086cdbb..0b95139f3dcba 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/cond-at-use.ll @@ -571,9 +571,8 @@ define i16 @cond_value_may_not_well_defined(i16 %x) { define i16 @and_elide_poison_flags(i16 noundef %a) { ; CHECK-LABEL: @and_elide_poison_flags( ; CHECK-NEXT: [[X:%.*]] = add nuw i16 [[A:%.*]], 1 -; CHECK-NEXT: [[AND:%.*]] = and i16 [[X]], 7 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[X]], 8 -; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i16 [[AND]], i16 24 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i16 [[X]], i16 24 ; CHECK-NEXT: ret i16 [[SEL]] ; %x = add nuw i16 %a, 1 From e01c063684b76da3ceacd01a0c47c73402cbc775 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Thu, 21 Dec 2023 08:18:47 -0800 Subject: [PATCH 085/342] [llvm] Use DenseMap::contains (NFC) --- llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h | 2 +- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 6 +++--- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 2 +- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h | 2 +- llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index 493689f6a61e7..2757b8cd54a69 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -447,7 +447,7 @@ struct MCDCRecord { bool isConditionIndependencePairCovered(unsigned Condition) const { auto It = PosToID.find(Condition); if (It != PosToID.end()) - return (IndependencePairs.find(It->second) != IndependencePairs.end()); + return IndependencePairs.contains(It->second); llvm_unreachable("Condition ID without an Ordinal mapping"); } diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index b8a6784ff3c62..3a34a0bfae46e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -645,7 +645,7 @@ SPIRVType *SPIRVGlobalRegistry::findSPIRVType( Register Reg = DT.find(Ty, &MIRBuilder.getMF()); if (Reg.isValid()) return getSPIRVTypeForVReg(Reg); - if (ForwardPointerTypes.find(Ty) != ForwardPointerTypes.end()) + if (ForwardPointerTypes.contains(Ty)) return ForwardPointerTypes[Ty]; return restOfCreateSPIRVType(Ty, MIRBuilder, AccQual, EmitIR); } @@ -712,14 +712,14 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType( // Null pointer means we have a loop in type definitions, make and // return corresponding OpTypeForwardPointer. if (SpvElementType == nullptr) { - if (ForwardPointerTypes.find(Ty) == ForwardPointerTypes.end()) + if (!ForwardPointerTypes.contains(Ty)) ForwardPointerTypes[PType] = getOpTypeForwardPointer(SC, MIRBuilder); return ForwardPointerTypes[PType]; } Register Reg(0); // If we have forward pointer associated with this type, use its register // operand to create OpTypePointer. 
- if (ForwardPointerTypes.find(PType) != ForwardPointerTypes.end()) + if (ForwardPointerTypes.contains(PType)) Reg = getSPIRVTypeID(ForwardPointerTypes[PType]); return getOpTypePointer(SC, SpvElementType, MIRBuilder, Reg); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 779036016560e..2a830535a2aa1 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -391,7 +391,7 @@ void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) { if (MI.getOpcode() != SPIRV::OpExtInst) continue; auto Set = MI.getOperand(2).getImm(); - if (MAI.ExtInstSetMap.find(Set) == MAI.ExtInstSetMap.end()) + if (!MAI.ExtInstSetMap.contains(Set)) MAI.ExtInstSetMap[Set] = Register::index2VirtReg(MAI.getNextID()); } } diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h index 5124181b49e2c..d0b8027edd420 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h @@ -189,7 +189,7 @@ struct ModuleAnalysisInfo { } unsigned getNextID() { return MaxID++; } bool hasMBBRegister(const MachineBasicBlock &MBB) { - return BBNumToRegMap.find(MBB.getNumber()) != BBNumToRegMap.end(); + return BBNumToRegMap.contains(MBB.getNumber()); } // Convert MBB's number to corresponding ID register. Register getOrCreateMBBRegister(const MachineBasicBlock &MBB) { diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index f4076be2a7b77..1bfce70fedc0e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -83,7 +83,7 @@ static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) { } for (MachineInstr *MI : ToErase) { Register Reg = MI->getOperand(2).getReg(); - if (RegsAlreadyAddedToDT.find(MI) != RegsAlreadyAddedToDT.end()) + if (RegsAlreadyAddedToDT.contains(MI)) Reg = RegsAlreadyAddedToDT[MI]; auto *RC = MRI.getRegClassOrNull(MI->getOperand(0).getReg()); if (!MRI.getRegClassOrNull(Reg) && RC) From 886655869cef2e0f11da8981da30d70ad7892ff9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Thu, 21 Dec 2023 08:18:49 -0800 Subject: [PATCH 086/342] [clang] Fix typos in documentation --- clang/docs/ControlFlowIntegrityDesign.rst | 2 +- clang/docs/LanguageExtensions.rst | 6 +++--- clang/docs/ReleaseNotes.rst | 2 +- clang/docs/SanitizerCoverage.rst | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/clang/docs/ControlFlowIntegrityDesign.rst b/clang/docs/ControlFlowIntegrityDesign.rst index f3a3c8294f7c7..d66bd16155a9f 100644 --- a/clang/docs/ControlFlowIntegrityDesign.rst +++ b/clang/docs/ControlFlowIntegrityDesign.rst @@ -349,7 +349,7 @@ address point. Note that libraries like libcxxabi do assume this property. (2) virtual function entry layout property -For each virtual function the distance between an virtual table entry for this function and the corresponding +For each virtual function the distance between a virtual table entry for this function and the corresponding address point is always the same. This property ensures that dynamic dispatch still works with the interleaving layout. 
Note that the interleaving scheme in the CFI implementation guarantees both properties above whereas the original scheme proposed diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 13fb7c345aa4e..23a7f4f5d5b92 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -2019,7 +2019,7 @@ would be +1. ``ns_returns_autoreleased`` specifies that the returned object is autorelease pool. **Usage**: The ``ns_consumed`` and ``cf_consumed`` attributes can be placed on -an parameter declaration; they specify that the argument is expected to have a +a parameter declaration; they specify that the argument is expected to have a +1 retain count, which will be balanced in some way by the function or method. The ``ns_consumes_self`` attribute can only be placed on an Objective-C method; it specifies that the method expects its ``self`` parameter to have a @@ -3601,7 +3601,7 @@ scalar calls of ``__builtin_isfpclass`` applied to the input elementwise. The result of ``__builtin_isfpclass`` is a boolean value, if the first argument is a scalar, or an integer vector with the same element count as the first argument. The element type in this vector has the same bit length as the -element of the the first argument type. +element of the first argument type. This function never raises floating-point exceptions and does not canonicalize its input. The floating-point argument is not promoted, its data class is @@ -4959,7 +4959,7 @@ Clang supports the following match rules: - ``record(unless(is_union))``: Can be used to apply attributes only to ``struct`` and ``class`` declarations. -- ``enum``: Can be be used to apply attributes to enumeration declarations. +- ``enum``: Can be used to apply attributes to enumeration declarations. - ``enum_constant``: Can be used to apply attributes to enumerators. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 39b9176865fc0..ee211c16a48ac 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -253,7 +253,7 @@ New Compiler Flags the preprocessed text to the output. This can greatly reduce the size of the preprocessed output, which can be helpful when trying to reduce a test case. * ``-fassume-nothrow-exception-dtor`` is added to assume that the destructor of - an thrown exception object will not throw. The generated code for catch + a thrown exception object will not throw. The generated code for catch handlers will be smaller. A throw expression of a type with a potentially-throwing destructor will lead to an error. diff --git a/clang/docs/SanitizerCoverage.rst b/clang/docs/SanitizerCoverage.rst index c7ced397c7223..45ad03cb43774 100644 --- a/clang/docs/SanitizerCoverage.rst +++ b/clang/docs/SanitizerCoverage.rst @@ -496,7 +496,7 @@ offsets in the corresponding binary/DSO that were executed during the run. Sancov Tool ----------- -An simple ``sancov`` tool is provided to process coverage files. +A simple ``sancov`` tool is provided to process coverage files. The tool is part of LLVM project and is currently supported only on Linux. It can handle symbolization tasks autonomously without any extra support from the environment. You need to pass .sancov files (named From 9f0f5587426a4ff24b240018cf8bf3acc3c566ae Mon Sep 17 00:00:00 2001 From: Tomas Matheson <tomas.matheson@arm.com> Date: Thu, 21 Dec 2023 16:25:55 +0000 Subject: [PATCH 087/342] Revert "[AArch64] Codegen support for FEAT_PAuthLR" This reverts commit 5992ce90b8c0fac06436c3c86621fbf6d5398ee5. 
Buildbot failures with expensive checks enabled. --- clang/include/clang/Basic/LangOptions.def | 1 - clang/include/clang/Basic/TargetInfo.h | 1 - clang/include/clang/Driver/Options.td | 2 - clang/lib/Basic/Targets/AArch64.cpp | 1 - clang/lib/Basic/Targets/ARM.cpp | 1 - clang/lib/CodeGen/CodeGenModule.cpp | 3 - clang/lib/CodeGen/Targets/AArch64.cpp | 2 - clang/lib/Driver/ToolChains/Clang.cpp | 7 +- .../CodeGen/aarch64-branch-protection-attr.c | 28 - clang/test/Driver/aarch64-pauth-lr.c | 23 - clang/test/Driver/aarch64-v95a.c | 7 - .../llvm/TargetParser/AArch64TargetParser.h | 2 - .../llvm/TargetParser/ARMTargetParserCommon.h | 1 - llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 11 - .../AArch64/AArch64MachineFunctionInfo.cpp | 28 +- .../AArch64/AArch64MachineFunctionInfo.h | 18 - .../lib/Target/AArch64/AArch64PointerAuth.cpp | 86 +-- .../TargetParser/ARMTargetParserCommon.cpp | 6 +- .../AArch64/sign-return-address-pauth-lr.ll | 542 ------------------ .../CodeGen/AArch64/sign-return-address.ll | 3 - .../TargetParser/TargetParserTest.cpp | 4 +- 21 files changed, 25 insertions(+), 752 deletions(-) delete mode 100644 clang/test/Driver/aarch64-pauth-lr.c delete mode 100644 llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 21abc346cf17a..152d9f65f86db 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -456,7 +456,6 @@ ENUM_LANGOPT(SignReturnAddressScope, SignReturnAddressScopeKind, 2, SignReturnAd ENUM_LANGOPT(SignReturnAddressKey, SignReturnAddressKeyKind, 1, SignReturnAddressKeyKind::AKey, "Key used for return address signing") LANGOPT(BranchTargetEnforcement, 1, 0, "Branch-target enforcement enabled") -LANGOPT(BranchProtectionPAuthLR, 1, 0, "Use PC as a diversifier using PAuthLR NOP instructions.") LANGOPT(SpeculativeLoadHardening, 1, 0, "Speculative load hardening enabled") diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index ac3c324c6c29c..aa0f5023104a1 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1372,7 +1372,6 @@ class TargetInfo : public TransferrableTargetInfo, LangOptions::SignReturnAddressKeyKind SignKey = LangOptions::SignReturnAddressKeyKind::AKey; bool BranchTargetEnforcement = false; - bool BranchProtectionPAuthLR = false; }; /// Determine if the Architecture in this TargetInfo supports branch diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2b93ddf033499..9678165bfd98e 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -7000,8 +7000,6 @@ def msign_return_address_key_EQ : Joined<["-"], "msign-return-address-key=">, Values<"a_key,b_key">; def mbranch_target_enforce : Flag<["-"], "mbranch-target-enforce">, MarshallingInfoFlag<LangOpts<"BranchTargetEnforcement">>; -def mbranch_protection_pauth_lr : Flag<["-"], "mbranch-protection-pauth-lr">, - MarshallingInfoFlag<LangOpts<"BranchProtectionPAuthLR">>; def fno_dllexport_inlines : Flag<["-"], "fno-dllexport-inlines">, MarshallingInfoNegativeFlag<LangOpts<"DllExportInlines">>; def cfguard_no_checks : Flag<["-"], "cfguard-no-checks">, diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 3ee39133fcee7..def16c032c869 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -225,7 +225,6 @@ bool 
AArch64TargetInfo::validateBranchProtection(StringRef Spec, StringRef, BPI.SignKey = LangOptions::SignReturnAddressKeyKind::BKey; BPI.BranchTargetEnforcement = PBP.BranchTargetEnforcement; - BPI.BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR; return true; } diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 6e1842fc64e50..ce7e4d4639cea 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -419,7 +419,6 @@ bool ARMTargetInfo::validateBranchProtection(StringRef Spec, StringRef Arch, BPI.SignKey = LangOptions::SignReturnAddressKeyKind::AKey; BPI.BranchTargetEnforcement = PBP.BranchTargetEnforcement; - BPI.BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR; return true; } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index d78f2594a2376..b2e173d0d6949 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1106,9 +1106,6 @@ void CodeGenModule::Release() { if (LangOpts.BranchTargetEnforcement) getModule().addModuleFlag(llvm::Module::Min, "branch-target-enforcement", 1); - if (LangOpts.BranchProtectionPAuthLR) - getModule().addModuleFlag(llvm::Module::Min, "branch-protection-pauth-lr", - 1); if (LangOpts.hasSignReturnAddress()) getModule().addModuleFlag(llvm::Module::Min, "sign-return-address", 1); if (LangOpts.isSignReturnAddressScopeAll()) diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index 7102d190fe008..be5145daa00b7 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -136,8 +136,6 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo { Fn->addFnAttr("branch-target-enforcement", BPI.BranchTargetEnforcement ? "true" : "false"); - Fn->addFnAttr("branch-protection-pauth-lr", - BPI.BranchProtectionPAuthLR ? 
"true" : "false"); } bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 4783affd3220b..de9fd5eaa1e02 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1497,7 +1497,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, << Triple.getArchName(); StringRef Scope, Key; - bool IndirectBranches, BranchProtectionPAuthLR; + bool IndirectBranches; if (A->getOption().matches(options::OPT_msign_return_address_EQ)) { Scope = A->getValue(); @@ -1506,7 +1506,6 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, << A->getSpelling() << Scope; Key = "a_key"; IndirectBranches = false; - BranchProtectionPAuthLR = false; } else { StringRef DiagMsg; llvm::ARM::ParsedBranchProtection PBP; @@ -1518,7 +1517,6 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, << "b-key" << A->getAsString(Args); Scope = PBP.Scope; Key = PBP.Key; - BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR; IndirectBranches = PBP.BranchTargetEnforcement; } @@ -1527,9 +1525,6 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, if (!Scope.equals("none")) CmdArgs.push_back( Args.MakeArgString(Twine("-msign-return-address-key=") + Key)); - if (BranchProtectionPAuthLR) - CmdArgs.push_back( - Args.MakeArgString(Twine("-mbranch-protection-pauth-lr"))); if (IndirectBranches) CmdArgs.push_back("-mbranch-target-enforce"); } diff --git a/clang/test/CodeGen/aarch64-branch-protection-attr.c b/clang/test/CodeGen/aarch64-branch-protection-attr.c index 8ab3e17ade426..3c2714e2feda2 100644 --- a/clang/test/CodeGen/aarch64-branch-protection-attr.c +++ b/clang/test/CodeGen/aarch64-branch-protection-attr.c @@ -46,24 +46,6 @@ __attribute__ ((target("branch-protection=pac-ret+leaf+bti"))) void btileaf() {} // CHECK: define{{.*}} void @btileaf() #[[#BTIPACLEAF:]] - -__attribute__ ((target("branch-protection=pac-ret+pc"))) -void pauthlr() {} -// CHECK: define{{.*}} void @pauthlr() #[[#PAUTHLR:]] - -__attribute__ ((target("branch-protection=pac-ret+pc+b-key"))) -void pauthlr_bkey() {} -// CHECK: define{{.*}} void @pauthlr_bkey() #[[#PAUTHLR_BKEY:]] - -__attribute__ ((target("branch-protection=pac-ret+pc+leaf"))) -void pauthlr_leaf() {} -// CHECK: define{{.*}} void @pauthlr_leaf() #[[#PAUTHLR_LEAF:]] - -__attribute__ ((target("branch-protection=pac-ret+pc+bti"))) -void pauthlr_bti() {} -// CHECK: define{{.*}} void @pauthlr_bti() #[[#PAUTHLR_BTI:]] - - // CHECK-DAG: attributes #[[#NONE]] = { {{.*}} "branch-target-enforcement"="false" {{.*}} "sign-return-address"="none" // CHECK-DAG: attributes #[[#STD]] = { {{.*}} "branch-target-enforcement"="true" {{.*}} "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" @@ -79,13 +61,3 @@ void pauthlr_bti() {} // CHECK-DAG: attributes #[[#PACBKEYLEAF]] = { {{.*}} "branch-target-enforcement"="false" {{.*}}"sign-return-address"="all" "sign-return-address-key"="b_key" // CHECK-DAG: attributes #[[#BTIPACLEAF]] = { {{.*}}"branch-target-enforcement"="true" {{.*}} "sign-return-address"="all" "sign-return-address-key"="a_key" - - -// CHECK-DAG: attributes #[[#PAUTHLR]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="a_key" - -// CHECK-DAG: attributes #[[#PAUTHLR_BKEY]] = { {{.*}}"branch-protection-pauth-lr"="true" 
{{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="b_key" - -// CHECK-DAG: attributes #[[#PAUTHLR_LEAF]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="all" "sign-return-address-key"="a_key" - -// CHECK-DAG: attributes #[[#PAUTHLR_BTI]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="true" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="a_key" - diff --git a/clang/test/Driver/aarch64-pauth-lr.c b/clang/test/Driver/aarch64-pauth-lr.c deleted file mode 100644 index 2e1b530fc9895..0000000000000 --- a/clang/test/Driver/aarch64-pauth-lr.c +++ /dev/null @@ -1,23 +0,0 @@ -// Check the -cc1 flags for the various forms of -mbranch-protection=pac-ret+pc. - -// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR -// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+b-key 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY -// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF -// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI -// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI -// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR -// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+b-key -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY -// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF -// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI -// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI - -// PAUTH-LR: "-msign-return-address=non-leaf" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr" -// PAUTH-LR-B-KEY: "-msign-return-address=non-leaf" "-msign-return-address-key=b_key" "-mbranch-protection-pauth-lr" -// PAUTH-LR-LEAF: "-msign-return-address=all" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr" -// PAUTH-LR-BTI: "-msign-return-address=non-leaf" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr" -// PAUTH-LR-LEAF-B-KEY-BTI: "-msign-return-address=all" "-msign-return-address-key=b_key" "-mbranch-protection-pauth-lr" "-mbranch-target-enforce" - -// NOT-PAUTH-LR: "-mbranch-target-enforce" -// NOT-PAUTH-LR-B-KEY: "-mbranch-target-enforce" -// NOT-PAUTH-LR-LEAF: "-mbranch-target-enforce" -// NOT-PAUTH-LR-BTI: "-mbranch-target-enforce" diff --git a/clang/test/Driver/aarch64-v95a.c b/clang/test/Driver/aarch64-v95a.c index 6fac62e8b389a..366cade86a9fb 100644 --- a/clang/test/Driver/aarch64-v95a.c +++ b/clang/test/Driver/aarch64-v95a.c @@ -1,5 +1,3 @@ -// ===== Base v9.5a architecture ===== - // RUN: %clang -target aarch64 -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // RUN: 
%clang -target aarch64 -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // RUN: %clang -target aarch64 -mlittle-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s @@ -7,7 +5,6 @@ // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // GENERICV95A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" - // RUN: %clang -target aarch64_be -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s // RUN: %clang -target aarch64_be -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s // RUN: %clang -target aarch64 -mbig-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s @@ -21,7 +18,3 @@ // RUN: %clang -target aarch64 -march=armv9.5a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s // RUN: %clang -target aarch64 -march=armv9.5-a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s // V95A-CPA: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" "-target-feature" "+cpa" - -// RUN: %clang -target aarch64 -march=armv9.5a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s -// RUN: %clang -target aarch64 -march=armv9.5-a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s -// V95A-PAUTHLR: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" "-target-feature" "+pauth-lr" diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 6c7410a8b8f79..f0b35790133fb 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -174,7 +174,6 @@ enum ArchExtKind : unsigned { AEK_SMEF8F32 = 70, // FEAT_SME_F8F32 AEK_SMEFA64 = 71, // FEAT_SME_FA64 AEK_CPA = 72, // FEAT_CPA - AEK_PAUTHLR = 73, // FEAT_PAuth_LR AEK_NUM_EXTENSIONS }; using ExtensionBitset = Bitset<AEK_NUM_EXTENSIONS>; @@ -298,7 +297,6 @@ inline constexpr ExtensionInfo Extensions[] = { {"sme-f8f32", AArch64::AEK_SMEF8F32, "+sme-f8f32", "-sme-f8f32", FEAT_INIT, "+sme2,+fp8", 0}, {"sme-fa64", AArch64::AEK_SMEFA64, "+sme-fa64", "-sme-fa64", FEAT_INIT, "", 0}, {"cpa", AArch64::AEK_CPA, "+cpa", "-cpa", FEAT_INIT, "", 0}, - {"pauth-lr", AArch64::AEK_PAUTHLR, "+pauth-lr", "-pauth-lr", FEAT_INIT, "", 0}, // Special cases {"none", AArch64::AEK_NONE, {}, {}, FEAT_INIT, "", ExtensionInfo::MaxFMVPriority}, }; diff --git a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h index 1e4187c6fb111..e3d9ffc1d4db5 100644 --- a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h +++ b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h @@ -41,7 +41,6 @@ struct ParsedBranchProtection { StringRef Scope; StringRef Key; bool BranchTargetEnforcement; - bool BranchProtectionPAuthLR; }; bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 6d85e1fb5fbf1..175f6ef49c3ba 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -8802,23 +8802,12 @@ 
AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT, // Don't outline anything used for return address signing. The outlined // function will get signed later if needed switch (MI.getOpcode()) { - case AArch64::PACM: case AArch64::PACIASP: case AArch64::PACIBSP: - case AArch64::PACIASPPC: - case AArch64::PACIBSPPC: case AArch64::AUTIASP: case AArch64::AUTIBSP: - case AArch64::AUTIASPPCi: - case AArch64::AUTIASPPCr: - case AArch64::AUTIBSPPCi: - case AArch64::AUTIBSPPCr: case AArch64::RETAA: case AArch64::RETAB: - case AArch64::RETAASPPCi: - case AArch64::RETAASPPCr: - case AArch64::RETABSPPCi: - case AArch64::RETABSPPCr: case AArch64::EMITBKEY: case AArch64::PAUTH_PROLOGUE: case AArch64::PAUTH_EPILOGUE: diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index 1a8c71888a852..9da59ef2a8062 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -93,24 +93,16 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F, // TODO: skip functions that have no instrumented allocas for optimization IsMTETagged = F.hasFnAttribute(Attribute::SanitizeMemTag); - // BTI/PAuthLR may be set either on the function or the module. Set Bool from - // either the function attribute or module attribute, depending on what is - // set. - // Note: the module attributed is numeric (0 or 1) but the function attribute - // is stringy ("true" or "false"). - auto TryFnThenModule = [&](StringRef AttrName, bool &Bool) { - if (F.hasFnAttribute(AttrName)) { - const StringRef V = F.getFnAttribute(AttrName).getValueAsString(); - assert(V.equals_insensitive("true") || V.equals_insensitive("false")); - Bool = V.equals_insensitive("true"); - } else if (const auto *ModVal = mdconst::extract_or_null<ConstantInt>( - F.getParent()->getModuleFlag(AttrName))) { - Bool = ModVal->getZExtValue(); - } - }; - - TryFnThenModule("branch-target-enforcement", BranchTargetEnforcement); - TryFnThenModule("branch-protection-pauth-lr", BranchProtectionPAuthLR); + if (!F.hasFnAttribute("branch-target-enforcement")) { + if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( + F.getParent()->getModuleFlag("branch-target-enforcement"))) + BranchTargetEnforcement = BTE->getZExtValue(); + } else { + const StringRef BTIEnable = + F.getFnAttribute("branch-target-enforcement").getValueAsString(); + assert(BTIEnable == "true" || BTIEnable == "false"); + BranchTargetEnforcement = BTIEnable == "true"; + } // The default stack probe size is 4096 if the function has no // stack-probe-size attribute. This is a safe default because it is the diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index cd4a18bfbc23a..219f83cfd32e0 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -22,7 +22,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCLinkerOptimizationHint.h" -#include "llvm/MC/MCSymbol.h" #include <cassert> #include <optional> @@ -165,21 +164,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// SignWithBKey modifies the default PAC-RET mode to signing with the B key. 
bool SignWithBKey = false; - /// SigningInstrOffset captures the offset of the PAC-RET signing instruction - /// within the prologue, so it can be re-used for authentication in the - /// epilogue when using PC as a second salt (FEAT_PAuth_LR) - MCSymbol *SignInstrLabel = nullptr; - /// BranchTargetEnforcement enables placing BTI instructions at potential /// indirect branch destinations. bool BranchTargetEnforcement = false; - /// Indicates that SP signing should be diversified with PC as-per PAuthLR. - /// This is set by -mbranch-protection and will emit NOP instructions unless - /// the subtarget feature +pauthlr is also used (in which case non-NOP - /// instructions are emitted). - bool BranchProtectionPAuthLR = false; - /// Whether this function has an extended frame record [Ctx, FP, LR]. If so, /// bit 60 of the in-memory FP will be 1 to enable other tools to detect the /// extended record. @@ -448,16 +436,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) const; bool shouldSignWithBKey() const { return SignWithBKey; } - - MCSymbol *getSigningInstrLabel() const { return SignInstrLabel; } - void setSigningInstrLabel(MCSymbol *Label) { SignInstrLabel = Label; } - bool isMTETagged() const { return IsMTETagged; } bool branchTargetEnforcement() const { return BranchTargetEnforcement; } - bool branchProtectionPAuthLR() const { return BranchProtectionPAuthLR; } - void setHasSwiftAsyncContext(bool HasContext) { HasSwiftAsyncContext = HasContext; } diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp index 334149a6bf5cf..7576d2a899d1a 100644 --- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp +++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp @@ -60,35 +60,11 @@ FunctionPass *llvm::createAArch64PointerAuthPass() { char AArch64PointerAuth::ID = 0; -// Where PAuthLR support is not known at compile time, it is supported using -// PACM. PACM is in the hint space so has no effect when PAuthLR is not -// supported by the hardware, but will alter the behaviour of PACI*SP, AUTI*SP -// and RETAA/RETAB if the hardware supports PAuthLR. -static void BuildPACM(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, - MachineInstr::MIFlag Flags, MCSymbol *PACSym = nullptr) { - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - auto &MFnI = *MBB.getParent()->getInfo<AArch64FunctionInfo>(); - - // ADR X16,<address_of_PACIASP> - if (PACSym) { - assert(Flags == MachineInstr::FrameDestroy); - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADR)) - .addReg(AArch64::X16) - .addSym(PACSym); - } - - // Only emit PACM if -mbranch-protection has +pc and the target does not - // have feature +pauth-lr. 
- if (MFnI.branchProtectionPAuthLR() && !Subtarget.hasPAuthLR()) - BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACM)).setMIFlag(Flags); -} - void AArch64PointerAuth::signLR(MachineFunction &MF, MachineBasicBlock::iterator MBBI) const { - auto &MFnI = *MF.getInfo<AArch64FunctionInfo>(); - bool UseBKey = MFnI.shouldSignWithBKey(); - bool EmitCFI = MFnI.needsDwarfUnwindInfo(MF); + const AArch64FunctionInfo *MFnI = MF.getInfo<AArch64FunctionInfo>(); + bool UseBKey = MFnI->shouldSignWithBKey(); + bool EmitCFI = MFnI->needsDwarfUnwindInfo(MF); bool NeedsWinCFI = MF.hasWinCFI(); MachineBasicBlock &MBB = *MBBI->getParent(); @@ -101,29 +77,11 @@ void AArch64PointerAuth::signLR(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - // PAuthLR authentication instructions need to know the value of PC at the - // point of signing (PACI*). - if (MFnI.branchProtectionPAuthLR()) { - MCSymbol *PACSym = MF.getMMI().getContext().createTempSymbol(); - MFnI.setSigningInstrLabel(PACSym); - } - // No SEH opcode for this one; it doesn't materialize into an // instruction on Windows. - if (MFnI.branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) { - BuildMI(MBB, MBBI, DL, - TII->get(MFnI.shouldSignWithBKey() ? AArch64::PACIBSPPC - : AArch64::PACIASPPC)) - .setMIFlag(MachineInstr::FrameSetup) - ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel()); - } else { - BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup); - BuildMI(MBB, MBBI, DL, - TII->get(MFnI.shouldSignWithBKey() ? AArch64::PACIBSP - : AArch64::PACIASP)) - .setMIFlag(MachineInstr::FrameSetup) - ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel()); - } + BuildMI(MBB, MBBI, DL, + TII->get(UseBKey ? AArch64::PACIBSP : AArch64::PACIASP)) + .setMIFlag(MachineInstr::FrameSetup); if (EmitCFI) { unsigned CFIIndex = @@ -160,37 +118,15 @@ void AArch64PointerAuth::authenticateLR( // DW_CFA_AARCH64_negate_ra_state can't be emitted. bool TerminatorIsCombinable = TI != MBB.end() && TI->getOpcode() == AArch64::RET; - MCSymbol *PACSym = MFnI->getSigningInstrLabel(); - if (Subtarget->hasPAuth() && TerminatorIsCombinable && !NeedsWinCFI && !MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) { - if (MFnI->branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) { - assert(PACSym && "No PAC instruction to refer to"); - BuildMI(MBB, TI, DL, - TII->get(UseBKey ? AArch64::RETABSPPCi : AArch64::RETAASPPCi)) - .addSym(PACSym) - .copyImplicitOps(*MBBI) - .setMIFlag(MachineInstr::FrameDestroy); - } else { - BuildPACM(*Subtarget, MBB, TI, DL, MachineInstr::FrameDestroy, PACSym); - BuildMI(MBB, TI, DL, TII->get(UseBKey ? AArch64::RETAB : AArch64::RETAA)) - .copyImplicitOps(*MBBI) - .setMIFlag(MachineInstr::FrameDestroy); - } + unsigned CombinedRetOpcode = UseBKey ? AArch64::RETAB : AArch64::RETAA; + BuildMI(MBB, TI, DL, TII->get(CombinedRetOpcode)).copyImplicitOps(*TI); MBB.erase(TI); } else { - if (MFnI->branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) { - assert(PACSym && "No PAC instruction to refer to"); - BuildMI(MBB, MBBI, DL, - TII->get(UseBKey ? AArch64::AUTIBSPPCi : AArch64::AUTIASPPCi)) - .addSym(PACSym) - .setMIFlag(MachineInstr::FrameDestroy); - } else { - BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy, PACSym); - BuildMI(MBB, MBBI, DL, - TII->get(UseBKey ? AArch64::AUTIBSP : AArch64::AUTIASP)) - .setMIFlag(MachineInstr::FrameDestroy); - } + unsigned AutOpcode = UseBKey ? 
AArch64::AUTIBSP : AArch64::AUTIASP; + BuildMI(MBB, MBBI, DL, TII->get(AutOpcode)) + .setMIFlag(MachineInstr::FrameDestroy); if (EmitAsyncCFI) { unsigned CFIIndex = diff --git a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp index 6d3a59d532fd3..10b80cad43472 100644 --- a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp +++ b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp @@ -134,13 +134,13 @@ ARM::EndianKind ARM::parseArchEndian(StringRef Arch) { } // Parse a branch protection specification, which has the form -// standard | none | [bti,pac-ret[+b-key,+leaf,+pc]*] +// standard | none | [bti,pac-ret[+b-key,+leaf]*] // Returns true on success, with individual elements of the specification // returned in `PBP`. Returns false in error, with `Err` containing // an erroneous part of the spec. bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, StringRef &Err) { - PBP = {"none", "a_key", false, false}; + PBP = {"none", "a_key", false}; if (Spec == "none") return true; // defaults are ok @@ -166,8 +166,6 @@ bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, PBP.Scope = "all"; else if (PACOpt == "b-key") PBP.Key = "b_key"; - else if (PACOpt == "pc") - PBP.BranchProtectionPAuthLR = true; else break; } diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll deleted file mode 100644 index a78fa853d99dc..0000000000000 --- a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll +++ /dev/null @@ -1,542 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 - -; PauthLR is controlled via a combination of -mbranch-protection and +pauth-lr. -; -mbranch-protection=+pc enables branch protection. If the feature +pauth-lr -; is available (v9.5a onwards) then non-NOP instructions are used; otherwise -; NOP instructions are used. - -; There are 6 cases to cover: - -; feature \ -mbranch-protection= | none | pac-ret | pac-ret+pc -; ------------------------------------------------------------------------ -; without +pauth-lr | no codegen | old pac | NOP pauth-lr -; with +pauth-lr | no codegen | old pac | non-NOP pauth-lr - -; sign-return-address.ll tests combinations of -mbranch-protection=none/pac-ret -; and whether +pauth-lr is present or not. 
- -; sign-return-address-pauth-lr.ll is identical, with the addition of this module -; attribute, which enables -mbranch-protection=pac-ret+pc, and therefore tests -; the remaining parameter combinations in the table: -!llvm.module.flags = !{!1} -!1 = !{i32 1, !"branch-protection-pauth-lr", i32 1} - -; RUN: llc -mtriple=aarch64 < %s | FileCheck --check-prefixes=CHECK,COMPAT %s -; RUN: llc -mtriple=aarch64 -mattr=v8.3a < %s | FileCheck --check-prefixes=CHECK,V83A %s -; RUN: llc -mtriple=aarch64 -mattr=v9a -mattr=pauth-lr < %s | FileCheck --check-prefixes=PAUTHLR %s - -define i32 @leaf(i32 %x) { -; CHECK-LABEL: leaf: -; CHECK: // %bb.0: -; CHECK-NEXT: ret -; -; PAUTHLR-LABEL: leaf: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: ret - ret i32 %x -} - -define i32 @leaf_sign_none(i32 %x) "sign-return-address"="none" { -; CHECK-LABEL: leaf_sign_none: -; CHECK: // %bb.0: -; CHECK-NEXT: ret -; -; PAUTHLR-LABEL: leaf_sign_none: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: ret - ret i32 %x -} - -define i32 @leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { -; CHECK-LABEL: leaf_sign_non_leaf: -; CHECK: // %bb.0: -; CHECK-NEXT: ret -; -; PAUTHLR-LABEL: leaf_sign_non_leaf: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: ret - ret i32 %x -} - -define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" { -; COMPAT-LABEL: leaf_sign_all: -; COMPAT: // %bb.0: -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: .Ltmp0: -; COMPAT-NEXT: hint #25 -; COMPAT-NEXT: .cfi_negate_ra_state -; COMPAT-NEXT: adr x16, .Ltmp0 -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: hint #29 -; COMPAT-NEXT: ret -; -; V83A-LABEL: leaf_sign_all: -; V83A: // %bb.0: -; V83A-NEXT: hint #39 -; V83A-NEXT: .Ltmp0: -; V83A-NEXT: paciasp -; V83A-NEXT: .cfi_negate_ra_state -; V83A-NEXT: adr x16, .Ltmp0 -; V83A-NEXT: hint #39 -; V83A-NEXT: retaa -; -; PAUTHLR-LABEL: leaf_sign_all: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: .Ltmp0: -; PAUTHLR-NEXT: paciasppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: retaasppc .Ltmp0 - ret i32 %x -} - -define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf" { -; COMPAT-LABEL: leaf_clobbers_lr: -; COMPAT: // %bb.0: -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: .Ltmp1: -; COMPAT-NEXT: hint #25 -; COMPAT-NEXT: .cfi_negate_ra_state -; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; COMPAT-NEXT: .cfi_def_cfa_offset 16 -; COMPAT-NEXT: .cfi_offset w30, -16 -; COMPAT-NEXT: //APP -; COMPAT-NEXT: mov x30, x0 -; COMPAT-NEXT: //NO_APP -; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; COMPAT-NEXT: adr x16, .Ltmp1 -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: hint #29 -; COMPAT-NEXT: ret -; -; V83A-LABEL: leaf_clobbers_lr: -; V83A: // %bb.0: -; V83A-NEXT: hint #39 -; V83A-NEXT: .Ltmp1: -; V83A-NEXT: paciasp -; V83A-NEXT: .cfi_negate_ra_state -; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; V83A-NEXT: .cfi_def_cfa_offset 16 -; V83A-NEXT: .cfi_offset w30, -16 -; V83A-NEXT: //APP -; V83A-NEXT: mov x30, x0 -; V83A-NEXT: //NO_APP -; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; V83A-NEXT: adr x16, .Ltmp1 -; V83A-NEXT: hint #39 -; V83A-NEXT: retaa -; -; PAUTHLR-LABEL: leaf_clobbers_lr: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: .Ltmp1: -; PAUTHLR-NEXT: paciasppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 -; PAUTHLR-NEXT: .cfi_offset w30, -16 -; PAUTHLR-NEXT: //APP -; PAUTHLR-NEXT: mov x30, x0 -; PAUTHLR-NEXT: //NO_APP -; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; PAUTHLR-NEXT: retaasppc .Ltmp1 - call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1 - ret i64 %x -} - -declare i32 @foo(i32) - -define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" { -; COMPAT-LABEL: non_leaf_sign_all: -; COMPAT: // %bb.0: -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: .Ltmp2: -; COMPAT-NEXT: hint #25 -; COMPAT-NEXT: .cfi_negate_ra_state -; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; COMPAT-NEXT: .cfi_def_cfa_offset 16 -; COMPAT-NEXT: .cfi_offset w30, -16 -; COMPAT-NEXT: bl foo -; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; COMPAT-NEXT: adr x16, .Ltmp2 -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: hint #29 -; COMPAT-NEXT: ret -; -; V83A-LABEL: non_leaf_sign_all: -; V83A: // %bb.0: -; V83A-NEXT: hint #39 -; V83A-NEXT: .Ltmp2: -; V83A-NEXT: paciasp -; V83A-NEXT: .cfi_negate_ra_state -; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; V83A-NEXT: .cfi_def_cfa_offset 16 -; V83A-NEXT: .cfi_offset w30, -16 -; V83A-NEXT: bl foo -; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; V83A-NEXT: adr x16, .Ltmp2 -; V83A-NEXT: hint #39 -; V83A-NEXT: retaa -; -; PAUTHLR-LABEL: non_leaf_sign_all: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: .Ltmp2: -; PAUTHLR-NEXT: paciasppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 -; PAUTHLR-NEXT: .cfi_offset w30, -16 -; PAUTHLR-NEXT: bl foo -; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; PAUTHLR-NEXT: retaasppc .Ltmp2 - %call = call i32 @foo(i32 %x) - ret i32 %call -} - -define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { -; COMPAT-LABEL: non_leaf_sign_non_leaf: -; COMPAT: // %bb.0: -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: .Ltmp3: -; COMPAT-NEXT: hint #25 -; COMPAT-NEXT: .cfi_negate_ra_state -; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; COMPAT-NEXT: .cfi_def_cfa_offset 16 -; COMPAT-NEXT: .cfi_offset w30, -16 -; COMPAT-NEXT: bl foo -; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; COMPAT-NEXT: adr x16, .Ltmp3 -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: hint #29 -; COMPAT-NEXT: ret -; -; V83A-LABEL: non_leaf_sign_non_leaf: -; V83A: // %bb.0: -; V83A-NEXT: hint #39 -; V83A-NEXT: .Ltmp3: -; V83A-NEXT: paciasp -; V83A-NEXT: .cfi_negate_ra_state -; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; V83A-NEXT: .cfi_def_cfa_offset 16 -; V83A-NEXT: .cfi_offset w30, -16 -; V83A-NEXT: bl foo -; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; V83A-NEXT: adr x16, .Ltmp3 -; V83A-NEXT: hint #39 -; V83A-NEXT: retaa -; -; PAUTHLR-LABEL: non_leaf_sign_non_leaf: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: .Ltmp3: -; PAUTHLR-NEXT: paciasppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 -; PAUTHLR-NEXT: .cfi_offset w30, -16 -; PAUTHLR-NEXT: bl foo -; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; PAUTHLR-NEXT: retaasppc .Ltmp3 - %call = call i32 @foo(i32 %x) - ret i32 %call -} - -; Should not use the RETAA instruction. 
-define i32 @non_leaf_scs(i32 %x) "sign-return-address"="non-leaf" shadowcallstack "target-features"="+v8.3a,+reserve-x18" { -; CHECK-LABEL: non_leaf_scs: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [x18], #8 -; CHECK-NEXT: .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 // -; CHECK-NEXT: hint #39 -; CHECK-NEXT: .Ltmp4: -; CHECK-NEXT: paciasp -; CHECK-NEXT: .cfi_negate_ra_state -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl foo -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: adr x16, .Ltmp4 -; CHECK-NEXT: hint #39 -; CHECK-NEXT: autiasp -; CHECK-NEXT: ldr x30, [x18, #-8]! -; CHECK-NEXT: ret -; -; PAUTHLR-LABEL: non_leaf_scs: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: str x30, [x18], #8 -; PAUTHLR-NEXT: .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 // -; PAUTHLR-NEXT: .Ltmp4: -; PAUTHLR-NEXT: paciasppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 -; PAUTHLR-NEXT: .cfi_offset w30, -16 -; PAUTHLR-NEXT: bl foo -; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; PAUTHLR-NEXT: autiasppc .Ltmp4 -; PAUTHLR-NEXT: ldr x30, [x18, #-8]! -; PAUTHLR-NEXT: ret - %call = call i32 @foo(i32 %x) - ret i32 %call -} - -define i32 @leaf_sign_all_v83(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" { -; CHECK-LABEL: leaf_sign_all_v83: -; CHECK: // %bb.0: -; CHECK-NEXT: hint #39 -; CHECK-NEXT: .Ltmp5: -; CHECK-NEXT: paciasp -; CHECK-NEXT: .cfi_negate_ra_state -; CHECK-NEXT: adr x16, .Ltmp5 -; CHECK-NEXT: hint #39 -; CHECK-NEXT: retaa -; -; PAUTHLR-LABEL: leaf_sign_all_v83: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: .Ltmp5: -; PAUTHLR-NEXT: paciasppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: retaasppc .Ltmp5 - ret i32 %x -} - -declare fastcc i64 @bar(i64) - -define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" { -; COMPAT-LABEL: spill_lr_and_tail_call: -; COMPAT: // %bb.0: -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: .Ltmp6: -; COMPAT-NEXT: hint #25 -; COMPAT-NEXT: .cfi_negate_ra_state -; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; COMPAT-NEXT: .cfi_def_cfa_offset 16 -; COMPAT-NEXT: .cfi_offset w30, -16 -; COMPAT-NEXT: //APP -; COMPAT-NEXT: mov x30, x0 -; COMPAT-NEXT: //NO_APP -; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; COMPAT-NEXT: adr x16, .Ltmp6 -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: hint #29 -; COMPAT-NEXT: b bar -; -; V83A-LABEL: spill_lr_and_tail_call: -; V83A: // %bb.0: -; V83A-NEXT: hint #39 -; V83A-NEXT: .Ltmp6: -; V83A-NEXT: paciasp -; V83A-NEXT: .cfi_negate_ra_state -; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; V83A-NEXT: .cfi_def_cfa_offset 16 -; V83A-NEXT: .cfi_offset w30, -16 -; V83A-NEXT: //APP -; V83A-NEXT: mov x30, x0 -; V83A-NEXT: //NO_APP -; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; V83A-NEXT: adr x16, .Ltmp6 -; V83A-NEXT: hint #39 -; V83A-NEXT: autiasp -; V83A-NEXT: b bar -; -; PAUTHLR-LABEL: spill_lr_and_tail_call: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: .Ltmp6: -; PAUTHLR-NEXT: paciasppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 -; PAUTHLR-NEXT: .cfi_offset w30, -16 -; PAUTHLR-NEXT: //APP -; PAUTHLR-NEXT: mov x30, x0 -; PAUTHLR-NEXT: //NO_APP -; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; PAUTHLR-NEXT: autiasppc .Ltmp6 -; PAUTHLR-NEXT: b bar - call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1 - tail call fastcc i64 @bar(i64 %x) - ret void -} - -define i32 @leaf_sign_all_a_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" { -; COMPAT-LABEL: leaf_sign_all_a_key: -; COMPAT: // %bb.0: -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: .Ltmp7: -; COMPAT-NEXT: hint #25 -; COMPAT-NEXT: .cfi_negate_ra_state -; COMPAT-NEXT: adr x16, .Ltmp7 -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: hint #29 -; COMPAT-NEXT: ret -; -; V83A-LABEL: leaf_sign_all_a_key: -; V83A: // %bb.0: -; V83A-NEXT: hint #39 -; V83A-NEXT: .Ltmp7: -; V83A-NEXT: paciasp -; V83A-NEXT: .cfi_negate_ra_state -; V83A-NEXT: adr x16, .Ltmp7 -; V83A-NEXT: hint #39 -; V83A-NEXT: retaa -; -; PAUTHLR-LABEL: leaf_sign_all_a_key: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: .Ltmp7: -; PAUTHLR-NEXT: paciasppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: retaasppc .Ltmp7 - ret i32 %x -} - -define i32 @leaf_sign_all_b_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="b_key" { -; COMPAT-LABEL: leaf_sign_all_b_key: -; COMPAT: // %bb.0: -; COMPAT-NEXT: .cfi_b_key_frame -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: .Ltmp8: -; COMPAT-NEXT: hint #27 -; COMPAT-NEXT: .cfi_negate_ra_state -; COMPAT-NEXT: adr x16, .Ltmp8 -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: hint #31 -; COMPAT-NEXT: ret -; -; V83A-LABEL: leaf_sign_all_b_key: -; V83A: // %bb.0: -; V83A-NEXT: .cfi_b_key_frame -; V83A-NEXT: hint #39 -; V83A-NEXT: .Ltmp8: -; V83A-NEXT: pacibsp -; V83A-NEXT: .cfi_negate_ra_state -; V83A-NEXT: adr x16, .Ltmp8 -; V83A-NEXT: hint #39 -; V83A-NEXT: retab -; -; PAUTHLR-LABEL: leaf_sign_all_b_key: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: .cfi_b_key_frame -; PAUTHLR-NEXT: .Ltmp8: -; PAUTHLR-NEXT: pacibsppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: retabsppc .Ltmp8 - ret i32 %x -} - -define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" { -; CHECK-LABEL: leaf_sign_all_v83_b_key: -; CHECK: // %bb.0: -; CHECK-NEXT: .cfi_b_key_frame -; CHECK-NEXT: hint #39 -; CHECK-NEXT: .Ltmp9: -; CHECK-NEXT: pacibsp -; CHECK-NEXT: .cfi_negate_ra_state -; CHECK-NEXT: adr x16, .Ltmp9 -; CHECK-NEXT: hint #39 -; CHECK-NEXT: retab -; -; PAUTHLR-LABEL: leaf_sign_all_v83_b_key: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: .cfi_b_key_frame -; PAUTHLR-NEXT: .Ltmp9: -; PAUTHLR-NEXT: pacibsppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: retabsppc .Ltmp9 - ret i32 %x -} - -; Note that BTI instruction is not needed before PACIASP. 
-define i32 @leaf_sign_all_a_key_bti(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" "branch-target-enforcement"="true"{ -; COMPAT-LABEL: leaf_sign_all_a_key_bti: -; COMPAT: // %bb.0: -; COMPAT-NEXT: hint #34 -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: .Ltmp10: -; COMPAT-NEXT: hint #25 -; COMPAT-NEXT: .cfi_negate_ra_state -; COMPAT-NEXT: adr x16, .Ltmp10 -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: hint #29 -; COMPAT-NEXT: ret -; -; V83A-LABEL: leaf_sign_all_a_key_bti: -; V83A: // %bb.0: -; V83A-NEXT: hint #34 -; V83A-NEXT: hint #39 -; V83A-NEXT: .Ltmp10: -; V83A-NEXT: paciasp -; V83A-NEXT: .cfi_negate_ra_state -; V83A-NEXT: adr x16, .Ltmp10 -; V83A-NEXT: hint #39 -; V83A-NEXT: retaa -; -; PAUTHLR-LABEL: leaf_sign_all_a_key_bti: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: bti c -; PAUTHLR-NEXT: .Ltmp10: -; PAUTHLR-NEXT: paciasppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: retaasppc .Ltmp10 - ret i32 %x -} - -; Note that BTI instruction is not needed before PACIBSP. -define i32 @leaf_sign_all_b_key_bti(i32 %x) "sign-return-address"="all" "sign-return-address-key"="b_key" "branch-target-enforcement"="true"{ -; COMPAT-LABEL: leaf_sign_all_b_key_bti: -; COMPAT: // %bb.0: -; COMPAT-NEXT: hint #34 -; COMPAT-NEXT: .cfi_b_key_frame -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: .Ltmp11: -; COMPAT-NEXT: hint #27 -; COMPAT-NEXT: .cfi_negate_ra_state -; COMPAT-NEXT: adr x16, .Ltmp11 -; COMPAT-NEXT: hint #39 -; COMPAT-NEXT: hint #31 -; COMPAT-NEXT: ret -; -; V83A-LABEL: leaf_sign_all_b_key_bti: -; V83A: // %bb.0: -; V83A-NEXT: hint #34 -; V83A-NEXT: .cfi_b_key_frame -; V83A-NEXT: hint #39 -; V83A-NEXT: .Ltmp11: -; V83A-NEXT: pacibsp -; V83A-NEXT: .cfi_negate_ra_state -; V83A-NEXT: adr x16, .Ltmp11 -; V83A-NEXT: hint #39 -; V83A-NEXT: retab -; -; PAUTHLR-LABEL: leaf_sign_all_b_key_bti: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: bti c -; PAUTHLR-NEXT: .cfi_b_key_frame -; PAUTHLR-NEXT: .Ltmp11: -; PAUTHLR-NEXT: pacibsppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: retabsppc .Ltmp11 - ret i32 %x -} - -; Note that BTI instruction is not needed before PACIBSP. -define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" "branch-target-enforcement"="true" { -; CHECK-LABEL: leaf_sign_all_v83_b_key_bti: -; CHECK: // %bb.0: -; CHECK-NEXT: hint #34 -; CHECK-NEXT: .cfi_b_key_frame -; CHECK-NEXT: hint #39 -; CHECK-NEXT: .Ltmp12: -; CHECK-NEXT: pacibsp -; CHECK-NEXT: .cfi_negate_ra_state -; CHECK-NEXT: adr x16, .Ltmp12 -; CHECK-NEXT: hint #39 -; CHECK-NEXT: retab -; -; PAUTHLR-LABEL: leaf_sign_all_v83_b_key_bti: -; PAUTHLR: // %bb.0: -; PAUTHLR-NEXT: bti c -; PAUTHLR-NEXT: .cfi_b_key_frame -; PAUTHLR-NEXT: .Ltmp12: -; PAUTHLR-NEXT: pacibsppc -; PAUTHLR-NEXT: .cfi_negate_ra_state -; PAUTHLR-NEXT: retabsppc .Ltmp12 - ret i32 %x -} diff --git a/llvm/test/CodeGen/AArch64/sign-return-address.ll b/llvm/test/CodeGen/AArch64/sign-return-address.ll index 1481d4beb50d6..5680915c7f414 100644 --- a/llvm/test/CodeGen/AArch64/sign-return-address.ll +++ b/llvm/test/CodeGen/AArch64/sign-return-address.ll @@ -2,9 +2,6 @@ ; RUN: llc -mtriple=aarch64 < %s | FileCheck --check-prefixes=CHECK,COMPAT %s ; RUN: llc -mtriple=aarch64 -mattr=v8.3a < %s | FileCheck --check-prefixes=CHECK,V83A %s -; v9.5-A is not expected to change codegen without -mbranch-protection=+pc, so reuse V83A. 
-; RUN: llc -mtriple=aarch64 -mattr=v9.5a < %s | FileCheck --check-prefixes=CHECK,V83A %s - define i32 @leaf(i32 %x) { ; CHECK-LABEL: leaf: ; CHECK: // %bb.0: diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 866176ab09836..30e60ad92b68e 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1812,8 +1812,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { AArch64::AEK_SSVE_FP8DOT4, AArch64::AEK_LUT, AArch64::AEK_SME_LUTv2, AArch64::AEK_SMEF8F16, AArch64::AEK_SMEF8F32, AArch64::AEK_SMEFA64, - AArch64::AEK_CPA, AArch64::AEK_PAUTHLR, - }; + AArch64::AEK_CPA}; std::vector<StringRef> Features; @@ -1900,7 +1899,6 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+sme-f8f32")); EXPECT_TRUE(llvm::is_contained(Features, "+sme-fa64")); EXPECT_TRUE(llvm::is_contained(Features, "+cpa")); - EXPECT_TRUE(llvm::is_contained(Features, "+pauth-lr")); // Assuming we listed every extension above, this should produce the same // result. (note that AEK_NONE doesn't have a name so it won't be in the From 199a0f9f5aaf72ff856f68e3bb708e783252af17 Mon Sep 17 00:00:00 2001 From: Tomas Matheson <tomas.matheson@arm.com> Date: Thu, 21 Dec 2023 16:26:39 +0000 Subject: [PATCH 088/342] Revert "[AArch64] Add FEAT_PAuthLR assembler support" This reverts commit 934b1099cbf14fa3f86a269dff957da8e5fb619f. Buildbot failues on sanitizer-x86_64-linux-fast --- llvm/lib/Target/AArch64/AArch64.td | 9 +- .../lib/Target/AArch64/AArch64InstrFormats.td | 74 --------- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 39 ----- llvm/lib/Target/AArch64/AArch64SchedA64FX.td | 2 +- .../Target/AArch64/AArch64SchedNeoverseN2.td | 2 +- .../AArch64/AsmParser/AArch64AsmParser.cpp | 28 ---- .../Disassembler/AArch64Disassembler.cpp | 18 --- .../MCTargetDesc/AArch64AsmBackend.cpp | 14 -- .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 4 - .../AArch64/MCTargetDesc/AArch64FixupKinds.h | 5 - .../MCTargetDesc/AArch64MCCodeEmitter.cpp | 29 ---- .../MC/AArch64/armv9.5a-pauthlr-diagnostics.s | 57 ------- llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s | 12 -- llvm/test/MC/AArch64/armv9.5a-pauthlr.s | 151 ------------------ .../Disassembler/AArch64/armv9.5a-pauthlr.txt | 78 --------- 15 files changed, 4 insertions(+), 518 deletions(-) delete mode 100644 llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s delete mode 100644 llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s delete mode 100644 llvm/test/MC/AArch64/armv9.5a-pauthlr.s delete mode 100644 llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 97e92a57a7ff4..db92a94e40e4b 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -622,13 +622,8 @@ def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedO def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly", "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">; -// AArch64 2023 Architecture Extensions (v9.5-A) - def FeatureCPA : SubtargetFeature<"cpa", "HasCPA", "true", - "Enable Armv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">; - -def FeaturePAuthLR : SubtargetFeature<"pauth-lr", "HasPAuthLR", - "true", "Enable Armv9.5-A PAC enhancements (FEAT_PAuth_LR)">; + "Enable ARMv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">; 
//===----------------------------------------------------------------------===// // Architectures. @@ -815,7 +810,7 @@ def SMEUnsupported : AArch64Unsupported { SME2Unsupported.F); } -let F = [HasPAuth, HasPAuthLR] in +let F = [HasPAuth] in def PAUnsupported : AArch64Unsupported; include "AArch64SchedA53.td" diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index cb63d8726744d..690ac0dcda621 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -2368,80 +2368,6 @@ class ClearAuth<bits<1> data, string asm> let Inst{4-0} = Rd; } -// v9.5-A FEAT_PAuth_LR - -class SignAuthFixedRegs<bits<5> opcode2, bits<6> opcode, string asm> - : I<(outs), (ins), asm, "", "", []>, - Sched<[WriteI, ReadI]> { - let Inst{31} = 0b1; // sf - let Inst{30} = 0b1; - let Inst{29} = 0b0; // S - let Inst{28-21} = 0b11010110; - let Inst{20-16} = opcode2; - let Inst{15-10} = opcode; - let Inst{9-5} = 0b11111; // Rn - let Inst{4-0} = 0b11110; // Rd -} - -def PAuthPCRelLabel16Operand : PCRelLabel<16> { - let Name = "PAuthPCRelLabel16"; - let PredicateMethod = "isPAuthPCRelLabel16Operand"; -} -def am_pauth_pcrel : Operand<OtherVT> { - let EncoderMethod = "getPAuthPCRelOpValue"; - let DecoderMethod = "DecodePCRelLabel16"; - let PrintMethod = "printAlignedLabel"; - let ParserMatchClass = PAuthPCRelLabel16Operand; - let OperandType = "OPERAND_PCREL"; -} - -class SignAuthPCRel<bits<2> opc, string asm> - : I<(outs), (ins am_pauth_pcrel:$label), asm, "\t$label", "", []>, - Sched<[]> { - bits<16> label; - let Inst{31} = 0b1; // sf - let Inst{30-23} = 0b11100111; - let Inst{22-21} = opc; - let Inst{20-5} = label; // imm - let Inst{4-0} = 0b11111; // Rd -} - -class SignAuthOneReg<bits<5> opcode2, bits<6> opcode, string asm> - : I<(outs), (ins GPR64:$Rn), asm, "\t$Rn", "", []>, - Sched<[]> { - bits<5> Rn; - let Inst{31} = 0b1; // sf - let Inst{30} = 0b1; - let Inst{29} = 0b0; // S - let Inst{28-21} = 0b11010110; - let Inst{20-16} = opcode2; - let Inst{15-10} = opcode; - let Inst{9-5} = Rn; - let Inst{4-0} = 0b11110; // Rd -} - -class SignAuthReturnPCRel<bits<3> opc, bits<5> op2, string asm> - : I<(outs), (ins am_pauth_pcrel:$label), asm, "\t$label", "", []>, - Sched<[WriteAtomic]> { - bits<16> label; - let Inst{31-24} = 0b01010101; - let Inst{23-21} = opc; - let Inst{20-5} = label; // imm16 - let Inst{4-0} = op2; -} - -class SignAuthReturnReg<bits<6> op3, string asm> - : I<(outs), (ins GPR64common:$Rm), asm, "\t$Rm", "", []>, - Sched<[WriteAtomic]> { - bits<5> Rm; - let Inst{31-25} = 0b1101011; - let Inst{24-21} = 0b0010; // opc - let Inst{20-16} = 0b11111; // op2 - let Inst{15-10} = op3; - let Inst{9-5} = 0b11111; // Rn - let Inst{4-0} = Rm; // op4 (Rm) -} - // Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions class BaseFlagManipulation<bit sf, bit sz, dag iops, string asm, string ops> : I<(outs), iops, asm, ops, "", []>, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 977729bb082b7..4ccac40f99a0a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -61,9 +61,6 @@ def HasLOR : Predicate<"Subtarget->hasLOR()">, def HasPAuth : Predicate<"Subtarget->hasPAuth()">, AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">; -def HasPAuthLR : Predicate<"Subtarget->hasPAuthLR()">, - AssemblerPredicateWithAll<(all_of FeaturePAuthLR), "pauth-lr">; - def HasJS : 
Predicate<"Subtarget->hasJS()">, AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">; @@ -1649,42 +1646,6 @@ let Predicates = [HasPAuth] in { } -// v9.5-A pointer authentication extensions - -// Always accept "pacm" as an alias for "hint #39", but don't emit it when -// disassembling if we don't have the pauth-lr feature. -let CRm = 0b0100 in { - def PACM : SystemNoOperands<0b111, "hint\t#39">; -} -def : InstAlias<"pacm", (PACM), 0>; - -let Predicates = [HasPAuthLR] in { - let Defs = [LR], Uses = [LR, SP] in { - // opcode2, opcode, asm - def PACIASPPC : SignAuthFixedRegs<0b00001, 0b101000, "paciasppc">; - def PACIBSPPC : SignAuthFixedRegs<0b00001, 0b101001, "pacibsppc">; - def PACNBIASPPC : SignAuthFixedRegs<0b00001, 0b100000, "pacnbiasppc">; - def PACNBIBSPPC : SignAuthFixedRegs<0b00001, 0b100001, "pacnbibsppc">; - // opc, asm - def AUTIASPPCi : SignAuthPCRel<0b00, "autiasppc">; - def AUTIBSPPCi : SignAuthPCRel<0b01, "autibsppc">; - // opcode2, opcode, asm - def AUTIASPPCr : SignAuthOneReg<0b00001, 0b100100, "autiasppc">; - def AUTIBSPPCr : SignAuthOneReg<0b00001, 0b100101, "autibsppc">; - } - - let Uses = [LR, SP], isReturn = 1, isTerminator = 1, isBarrier = 1 in { - // opc, op2, asm - def RETAASPPCi : SignAuthReturnPCRel<0b000, 0b11111, "retaasppc">; - def RETABSPPCi : SignAuthReturnPCRel<0b001, 0b11111, "retabsppc">; - // op3, asm - def RETAASPPCr : SignAuthReturnReg<0b000010, "retaasppc">; - def RETABSPPCr : SignAuthReturnReg<0b000011, "retabsppc">; - } - def : InstAlias<"pacm", (PACM), 1>; -} - - // v8.3a floating point conversion for javascript let Predicates = [HasJS, HasFPARMv8], Defs = [NZCV] in def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td index 7edce4b61605d..813b4a3affcfd 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td @@ -22,7 +22,7 @@ def A64FXModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F, SVEUnsupported.F, [HasMTE, HasMatMulInt8, HasBF16, - HasPAuth, HasPAuthLR, HasCPA]); + HasPAuth, HasCPA]); let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td index a6fab5e6245f8..53cf725f0e235 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -19,7 +19,7 @@ def NeoverseN2Model : SchedMachineModel { let CompleteModel = 1; list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F, - [HasSVE2p1, HasPAuthLR, HasCPA]); + [HasSVE2p1, HasCPA]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 38a92cb096029..74afa4183e67e 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1696,21 +1696,6 @@ class AArch64Operand : public MCParsedAsmOperand { return DiagnosticPredicateTy::Match; } - bool isPAuthPCRelLabel16Operand() const { - // PAuth PCRel16 operands are similar to regular branch targets, but only - // negative values are allowed for concrete immediates as signing instr - // should be in a lower address. 
- if (!isImm()) - return false; - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) - return true; - int64_t Val = MCE->getValue(); - if (Val & 0b11) - return false; - return (Val <= 0) && (Val > -(1 << 18)); - } - void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. Null MCExpr = 0. if (!Expr) @@ -2012,19 +1997,6 @@ class AArch64Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2)); } - void addPAuthPCRelLabel16Operands(MCInst &Inst, unsigned N) const { - // PC-relative operands don't encode the low bits, so shift them off - // here. If it's a label, however, just put it on directly as there's - // not enough information now to do anything. - assert(N == 1 && "Invalid number of operands!"); - const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); - if (!MCE) { - addExpr(Inst, getImm()); - return; - } - Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2)); - } - void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const { // Branch operands don't encode the low bits, so shift them off // here. If it's a label, however, just put it on directly as there's diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index c8cebaa5995e0..cf2d3879292d1 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -165,9 +165,6 @@ static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, uint64_t Address, const MCDisassembler *Decoder); -static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm, - uint64_t Address, - const MCDisassembler *Decoder); static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, uint64_t Address, const MCDisassembler *Decoder); @@ -890,21 +887,6 @@ static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, return Success; } -static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm, - uint64_t Addr, - const MCDisassembler *Decoder) { - // Immediate is encoded as the top 16-bits of an unsigned 18-bit negative - // PC-relative offset. 
- int64_t ImmVal = Imm; - if (ImmVal < 0 || ImmVal > (1 << 16)) - return Fail; - ImmVal = -ImmVal; - if (!Decoder->tryAddingSymbolicOperand(Inst, (ImmVal << 2), Addr, - /*IsBranch=*/false, 0, 0, 4)) - Inst.addOperand(MCOperand::createImm(ImmVal)); - return Success; -} - static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, uint64_t Addr, const MCDisassembler *Decoder) { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 30ef3680ae79c..a6900b8963bb3 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -67,7 +67,6 @@ class AArch64AsmBackend : public MCAsmBackend { {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal}, {"fixup_aarch64_movw", 5, 16, 0}, {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal}, - {"fixup_aarch64_pcrel_branch16", 5, 16, PCRelFlagVal}, {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal}, {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal}, {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}}; @@ -122,7 +121,6 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case AArch64::fixup_aarch64_movw: case AArch64::fixup_aarch64_pcrel_branch14: - case AArch64::fixup_aarch64_pcrel_branch16: case AArch64::fixup_aarch64_add_imm12: case AArch64::fixup_aarch64_ldst_imm12_scale1: case AArch64::fixup_aarch64_ldst_imm12_scale2: @@ -316,17 +314,6 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, if (Value & 0x3) Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3fff; - case AArch64::fixup_aarch64_pcrel_branch16: - // Unsigned PC-relative offset, so invert the negative immediate. - SignedValue = -SignedValue; - Value = static_cast<uint64_t>(SignedValue); - // Check valid 18-bit unsigned range. - if (SignedValue < 0 || SignedValue > ((1 << 18) - 1)) - Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); - // Low two bits are not encoded (4-byte alignment assumed). 
- if (Value & 0b11) - Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); - return (Value >> 2) & 0xffff; case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: if (TheTriple.isOSBinFormatCOFF() && !IsResolved && SignedValue != 0) { @@ -393,7 +380,6 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con case AArch64::fixup_aarch64_movw: case AArch64::fixup_aarch64_pcrel_branch14: - case AArch64::fixup_aarch64_pcrel_branch16: case AArch64::fixup_aarch64_add_imm12: case AArch64::fixup_aarch64_ldst_imm12_scale1: case AArch64::fixup_aarch64_ldst_imm12_scale2: diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 496ab18e9b195..9de40661298cc 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -186,10 +186,6 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(LD_PREL_LO19); case AArch64::fixup_aarch64_pcrel_branch14: return R_CLS(TSTBR14); - case AArch64::fixup_aarch64_pcrel_branch16: - Ctx.reportError(Fixup.getLoc(), - "relocation of PAC/AUT instructions is not supported"); - return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_pcrel_branch19: return R_CLS(CONDBR19); default: diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h index fdee2d5ad2bf3..767dd88055201 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -43,11 +43,6 @@ enum Fixups { // The high 14 bits of a 21-bit pc-relative immediate. fixup_aarch64_pcrel_branch14, - // The high 16 bits of a 18-bit unsigned PC-relative immediate. Used by - // pointer authentication, only within a function, so no relocation can be - // generated. - fixup_aarch64_pcrel_branch16, - // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as // fixup_aarch64_pcrel_adrhi, except this is use by b.cc and generates // relocations directly when necessary. diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index c3e12b6d8024e..dbc4323a860f5 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -88,12 +88,6 @@ class AArch64MCCodeEmitter : public MCCodeEmitter { SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - /// getPAuthPCRelOpValue - Return the encoded value for a pointer - /// authentication pc-relative operand. - uint32_t getPAuthPCRelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - /// getLoadLiteralOpValue - Return the encoded value for a load-literal /// pc-relative address. uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, @@ -333,29 +327,6 @@ uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue( return 0; } -/// getPAuthPCRelOpValue - Return the encoded value for a pointer -/// authentication pc-relative operand. 
-uint32_t -AArch64MCCodeEmitter::getPAuthPCRelOpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - - // If the destination is an immediate, invert sign as it's a negative value - // that should be encoded as unsigned - if (MO.isImm()) - return -(MO.getImm()); - assert(MO.isExpr() && "Unexpected target type!"); - - MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch16); - Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc())); - - ++MCNumFixups; - - // All of the information is in the fixup. - return 0; -} - /// getLoadLiteralOpValue - Return the encoded value for a load-literal /// pc-relative address. uint32_t diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s deleted file mode 100644 index d06183be9da3e..0000000000000 --- a/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s +++ /dev/null @@ -1,57 +0,0 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr 2>&1 < %s | FileCheck %s - - autiasppc #2 -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset -// CHECK-NEXT: autiasppc #2 -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - - autiasppc #1<<17 -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset -// CHECK-NEXT: autiasppc #1<<17 -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - - autiasppc #-2 -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset -// CHECK-NEXT: autiasppc #-2 -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - - autiasppc w0 -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset -// CHECK-NEXT: autiasppc w0 -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - - autiasppc sp -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset -// CHECK-NEXT: autiasppc sp -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - - retabsppc #2 -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset -// CHECK-NEXT: retabsppc #2 -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - - retabsppc #(1<<17) -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset -// CHECK-NEXT: retabsppc #(1<<17) -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - - retabsppc #-2 -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset -// CHECK-NEXT: retabsppc #-2 -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - - retaasppc w0 -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset -// CHECK-NEXT: retaasppc w0 -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - - retaasppc sp -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset -// CHECK-NEXT: retaasppc sp -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - - retaasppc xzr -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset -// CHECK-NEXT: retaasppc xzr -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s deleted file mode 100644 index c10142a199766..0000000000000 --- a/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr -filetype=obj -o /dev/null 2>&1 < %s | FileCheck %s - - autiasppc undef_label -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: relocation of PAC/AUT instructions is not supported -// 
CHECK-NEXT: autiasppc undef_label -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - - autibsppc undef_label -// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: relocation of PAC/AUT instructions is not supported -// CHECK-NEXT: autibsppc undef_label -// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: - diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr.s deleted file mode 100644 index 24e9c44984683..0000000000000 --- a/llvm/test/MC/AArch64/armv9.5a-pauthlr.s +++ /dev/null @@ -1,151 +0,0 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr < %s \ -// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST -// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ -// RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+pauth-lr < %s \ -// RUN: | llvm-objdump -d --mattr=+pauth-lr - | FileCheck %s --check-prefix=CHECK-DISASS -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+pauth-lr < %s \ -// RUN: | llvm-objdump -d --mattr=-pauth-lr - | FileCheck %s --check-prefix=CHECK-UNKNOWN - -// Label at address 4, so we can test that the address shows up in the -// disassembly. - nop -label1: - - paciasppc -// CHECK-INST: paciasppc -// CHECK-DISASS: paciasppc -// CHECK-ENCODING: [0xfe,0xa3,0xc1,0xda] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: dac1a3fe <unknown> - - pacibsppc -// CHECK-INST: pacibsppc -// CHECK-DISASS: pacibsppc -// CHECK-ENCODING: [0xfe,0xa7,0xc1,0xda] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: dac1a7fe <unknown> - - pacnbiasppc -// CHECK-INST: pacnbiasppc -// CHECK-DISASS: pacnbiasppc -// CHECK-ENCODING: [0xfe,0x83,0xc1,0xda] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: dac183fe <unknown> - - pacnbibsppc -// CHECK-INST: pacnbibsppc -// CHECK-DISASS: pacnbibsppc -// CHECK-ENCODING: [0xfe,0x87,0xc1,0xda] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: dac187fe <unknown> - - autiasppc label1 -// CHECK-INST: autiasppc label1 -// CHECK-DISASS: autiasppc 0x4 <label1> -// CHECK-ENCODING: [0bAAA11111,A,0b100AAAAA,0xf3] -// CHECK-ENCODING: fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: f380009f <unknown> - - autibsppc label1 -// CHECK-INST: autibsppc label1 -// CHECK-DISASS: autibsppc 0x4 <label1> -// CHECK-ENCODING: [0bAAA11111,A,0b101AAAAA,0xf3] -// CHECK-ENCODING: fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: f3a000bf <unknown> - - autibsppc #0 -// CHECK-INST: autibsppc #0 -// CHECK-DISASS: autibsppc 0x1c <label1+0x18> -// CHECK-ENCODING: [0x1f,0x00,0xa0,0xf3] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: f3a0001f <unknown> - - autibsppc #-(1<<18)+4 -// CHECK-INST: autibsppc #-262140 -// CHECK-DISASS: autibsppc 0xfffffffffffc0024 <label1+0xfffffffffffc0020> -// CHECK-ENCODING: [0xff,0xff,0xbf,0xf3] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: f3bfffff <unknown> - - autiasppc x0 -// CHECK-INST: autiasppc x0 -// CHECK-DISASS: autiasppc x0 -// CHECK-ENCODING: [0x1e,0x90,0xc1,0xda] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: dac1901e <unknown> - - autibsppc x1 -// CHECK-INST: autibsppc x1 -// CHECK-DISASS: autibsppc x1 -// CHECK-ENCODING: [0x3e,0x94,0xc1,0xda] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: dac1943e <unknown> - - autiasppc xzr -// 
CHECK-INST: autiasppc xzr -// CHECK-DISASS: autiasppc xzr -// CHECK-ENCODING: [0xfe,0x93,0xc1,0xda] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: dac193fe <unknown> - - autibsppc xzr -// CHECK-INST: autibsppc xzr -// CHECK-DISASS: autibsppc xzr -// CHECK-ENCODING: [0xfe,0x97,0xc1,0xda] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: dac197fe <unknown> - - - retaasppc label1 -// CHECK-INST: retaasppc label1 -// CHECK-DISASS: retaasppc 0x4 <label1> -// CHECK-ENCODING: [0bAAA11111,A,0b000AAAAA,0x55] -// CHECK-ENCODING: // fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: 5500019f <unknown> - - retabsppc label1 -// CHECK-INST: retabsppc label1 -// CHECK-DISASS: retabsppc 0x4 <label1> -// CHECK-ENCODING: [0bAAA11111,A,0b001AAAAA,0x55] -// CHECK-ENCODING: // fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: 552001bf <unknown> - - retaasppc #0 -// CHECK-INST: retaasppc #0 -// CHECK-DISASS: retaasppc 0x3c <label1+0x38> -// CHECK-ENCODING: [0x1f,0x00,0x00,0x55] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: 5500001f <unknown> - - retaasppc #-(1<<18)+4 -// CHECK-INST: retaasppc #-262140 -// CHECK-DISASS: retaasppc 0xfffffffffffc0044 <label1+0xfffffffffffc0040> -// CHECK-ENCODING: [0xff,0xff,0x1f,0x55] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: 551fffff <unknown> - - retaasppc x2 -// CHECK-INST: retaasppc x2 -// CHECK-DISASS: retaasppc x2 -// CHECK-ENCODING: [0xe2,0x0b,0x5f,0xd6] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: d65f0be2 <unknown> - - retabsppc x3 -// CHECK-INST: retabsppc x3 -// CHECK-DISASS: retabsppc x3 -// CHECK-ENCODING: [0xe3,0x0f,0x5f,0xd6] -// CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: d65f0fe3 <unknown> - - pacm -// CHECK-INST: pacm -// CHECK-DISASS: pacm -// CHECK-ENCODING: [0xff,0x24,0x03,0xd5] -// CHECK-ERROR-NOT: instruction requires: -// CHECK-UNKNOWN: d50324ff hint #39 diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt deleted file mode 100644 index caf1fde2c2b7c..0000000000000 --- a/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt +++ /dev/null @@ -1,78 +0,0 @@ -# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+pauth-lr < %s | FileCheck %s -# RUN: not llvm-mc -triple aarch64 -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NO-PAUTHLR - -[0xfe,0xa3,0xc1,0xda] -# CHECK: paciasppc -# NO-PAUTHLR: invalid instruction encoding - -[0xfe,0xa7,0xc1,0xda] -# CHECK: pacibsppc -# NO-PAUTHLR: invalid instruction encoding - -[0xfe,0x83,0xc1,0xda] -# CHECK: pacnbiasppc -# NO-PAUTHLR: invalid instruction encoding - -[0xfe,0x87,0xc1,0xda] -# CHECK: pacnbibsppc -# NO-PAUTHLR: invalid instruction encoding - -[0x9f,0x00,0x80,0xf3] -# CHECK: autiasppc #-16 -# NO-PAUTHLR: invalid instruction encoding - -[0xbf,0x00,0xa0,0xf3] -# CHECK: autibsppc #-20 -# NO-PAUTHLR: invalid instruction encoding - -[0x1f,0x00,0xa0,0xf3] -# CHECK: autibsppc #0 -# NO-PAUTHLR: invalid instruction encoding - -[0xff,0xff,0xbf,0xf3] -# CHECK: autibsppc #-262140 -# NO-PAUTHLR: invalid instruction encoding - -[0x1e,0x90,0xc1,0xda] -# CHECK: autiasppc x0 -# NO-PAUTHLR: invalid instruction encoding - -[0x3e,0x94,0xc1,0xda] -# CHECK: autibsppc x1 -# NO-PAUTHLR: invalid instruction encoding - -[0xfe,0x93,0xc1,0xda] -# CHECK: autiasppc xzr -# 
NO-PAUTHLR: invalid instruction encoding - -[0xfe,0x97,0xc1,0xda] -# CHECK: autibsppc xzr -# NO-PAUTHLR: invalid instruction encoding - -[0xbf,0x01,0x00,0x55] -# CHECK: retaasppc #-52 -# NO-PAUTHLR: invalid instruction encoding - -[0xdf,0x01,0x20,0x55] -# CHECK: retabsppc #-56 -# NO-PAUTHLR: invalid instruction encoding - -[0x1f,0x00,0x00,0x55] -# CHECK: retaasppc #0 -# NO-PAUTHLR: invalid instruction encoding - -[0xff,0xff,0x1f,0x55] -# CHECK: retaasppc #-262140 -# NO-PAUTHLR: invalid instruction encoding - -[0xe2,0x0b,0x5f,0xd6] -# CHECK: retaasppc x2 -# NO-PAUTHLR: invalid instruction encoding - -[0xe3,0x0f,0x5f,0xd6] -# CHECK: retabsppc x3 -# NO-PAUTHLR: invalid instruction encoding - -[0xff,0x24,0x03,0xd5] -# CHECK: pacm -# NO-PAUTHLR: hint #39 From 0dcff0db3a11d8a3cb8f78bf3b995f1211c2585d Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Thu, 21 Dec 2023 08:38:32 -0800 Subject: [PATCH 089/342] [RISCV] Add codegen support for experimental.vp.splice (#74688) IR intrinsics were already defined, but no codegen support had been added. I extracted this code from our downstream. Some of it may have come from https://repo.hca.bsc.es/gitlab/rferrer/llvm-epi/ originally. --- .../SelectionDAG/LegalizeIntegerTypes.cpp | 17 + llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 89 ++- llvm/lib/Target/RISCV/RISCVISelLowering.h | 1 + .../CodeGen/RISCV/vp-splice-fixed-vectors.ll | 281 +++++++++ .../RISCV/vp-splice-mask-fixed-vectors.ll | 316 ++++++++++ .../CodeGen/RISCV/vp-splice-mask-vectors.ll | 553 ++++++++++++++++++ llvm/test/CodeGen/RISCV/vp-splice.ll | 330 +++++++++++ 8 files changed, 1586 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/vp-splice-fixed-vectors.ll create mode 100644 llvm/test/CodeGen/RISCV/vp-splice-mask-fixed-vectors.ll create mode 100644 llvm/test/CodeGen/RISCV/vp-splice-mask-vectors.ll create mode 100644 llvm/test/CodeGen/RISCV/vp-splice.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 362fa92dd44b2..3d21bd22e6ef5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1871,6 +1871,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::EXPERIMENTAL_VP_STRIDED_STORE: Res = PromoteIntOp_VP_STRIDED(N, OpNo); break; + case ISD::EXPERIMENTAL_VP_SPLICE: + Res = PromoteIntOp_VP_SPLICE(N, OpNo); + break; } // If the result is null, the sub-method took care of registering results etc. 
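
For readers unfamiliar with the intrinsic being lowered, the new tests added further down declare it as taking two source vectors, a constant signed offset, a mask, and one explicit vector length (EVL) per source operand. The sketch below is illustrative only (the function name @vp_splice_sketch and the offset of 5 are not part of this patch); it restates the call shape used throughout the added vp-splice*.ll tests:

  declare <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32>, <4 x i32>, i32, <4 x i1>, i32, i32)

  define <4 x i32> @vp_splice_sketch(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 zeroext %evla, i32 zeroext %evlb) {
    ; Roughly: the result consists of the elements of %va starting at index 5
    ; (bounded by %evla), followed by the leading elements of %vb (bounded by
    ; %evlb); the mask governs which result lanes are computed.
    %v = call <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32> %va, <4 x i32> %vb, i32 5, <4 x i1> %m, i32 %evla, i32 %evlb)
    ret <4 x i32> %v
  }

As the lowering added below (lowerVPSpliceExperimental) and the CHECK lines in the new tests show, on RISC-V this is selected as a vslidedown of the first operand by the offset, followed by a vslideup of the second operand at position evla - 5.
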
@@ -2549,6 +2552,20 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) { return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo) { + SmallVector<SDValue, 6> NewOps(N->op_begin(), N->op_end()); + + if (OpNo == 2) { // Offset operand + NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); + } + + assert((OpNo == 4 || OpNo == 5) && "Unexpected operand for promotion"); + + NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo)); + return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); +} + //===----------------------------------------------------------------------===// // Integer Result Expansion //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 9d5931b44ac67..4a249e7a2dc92 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -410,6 +410,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index de15bea72e466..d6dedd669ffd0 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -675,7 +675,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FP_TO_UINT, ISD::VP_SETCC, ISD::VP_SIGN_EXTEND, ISD::VP_ZERO_EXTEND, ISD::VP_TRUNCATE, ISD::VP_SMIN, ISD::VP_SMAX, ISD::VP_UMIN, ISD::VP_UMAX, - ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE}; + ISD::VP_ABS, ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE}; static const unsigned FloatingPointVPOps[] = { ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL, @@ -688,7 +688,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FCEIL, ISD::VP_FFLOOR, ISD::VP_FROUND, ISD::VP_FROUNDEVEN, ISD::VP_FCOPYSIGN, ISD::VP_FROUNDTOZERO, ISD::VP_FRINT, ISD::VP_FNEARBYINT, ISD::VP_IS_FPCLASS, - ISD::EXPERIMENTAL_VP_REVERSE}; + ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE}; static const unsigned IntegerVecReduceOps[] = { ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, ISD::VECREDUCE_OR, @@ -773,6 +773,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_REVERSE, VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); setOperationPromotedToType( @@ -1147,6 +1148,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_SETCC, ISD::VP_TRUNCATE}, VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); continue; } @@ -6637,6 +6639,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, !Subtarget.hasVInstructionsF16())) return SplitVPOp(Op, DAG); return lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(Op, DAG, Subtarget); + case ISD::EXPERIMENTAL_VP_SPLICE: + return lowerVPSpliceExperimental(Op, DAG); case ISD::EXPERIMENTAL_VP_REVERSE: return lowerVPReverseExperimental(Op, DAG); } @@ 
-10582,6 +10586,87 @@ SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op, return convertFromScalableVector(VT, Result, DAG, Subtarget); } +SDValue +RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + + SDValue Op1 = Op.getOperand(0); + SDValue Op2 = Op.getOperand(1); + SDValue Offset = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue EVL1 = Op.getOperand(4); + SDValue EVL2 = Op.getOperand(5); + + const MVT XLenVT = Subtarget.getXLenVT(); + MVT VT = Op.getSimpleValueType(); + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget); + Op2 = convertToScalableVector(ContainerVT, Op2, DAG, Subtarget); + MVT MaskVT = getMaskTypeFor(ContainerVT); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + } + + bool IsMaskVector = VT.getVectorElementType() == MVT::i1; + if (IsMaskVector) { + ContainerVT = ContainerVT.changeVectorElementType(MVT::i8); + + // Expand input operands + SDValue SplatOneOp1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), + DAG.getConstant(1, DL, XLenVT), EVL1); + SDValue SplatZeroOp1 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), + DAG.getConstant(0, DL, XLenVT), EVL1); + Op1 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op1, SplatOneOp1, + SplatZeroOp1, EVL1); + + SDValue SplatOneOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), + DAG.getConstant(1, DL, XLenVT), EVL2); + SDValue SplatZeroOp2 = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, ContainerVT, + DAG.getUNDEF(ContainerVT), + DAG.getConstant(0, DL, XLenVT), EVL2); + Op2 = DAG.getNode(RISCVISD::VSELECT_VL, DL, ContainerVT, Op2, SplatOneOp2, + SplatZeroOp2, EVL2); + } + + int64_t ImmValue = cast<ConstantSDNode>(Offset)->getSExtValue(); + SDValue DownOffset, UpOffset; + if (ImmValue >= 0) { + // The operand is a TargetConstant, we need to rebuild it as a regular + // constant. + DownOffset = DAG.getConstant(ImmValue, DL, XLenVT); + UpOffset = DAG.getNode(ISD::SUB, DL, XLenVT, EVL1, DownOffset); + } else { + // The operand is a TargetConstant, we need to rebuild it as a regular + // constant rather than negating the original operand. 
+ UpOffset = DAG.getConstant(-ImmValue, DL, XLenVT); + DownOffset = DAG.getNode(ISD::SUB, DL, XLenVT, EVL1, UpOffset); + } + + SDValue SlideDown = + getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT), + Op1, DownOffset, Mask, UpOffset); + SDValue Result = getVSlideup(DAG, Subtarget, DL, ContainerVT, SlideDown, Op2, + UpOffset, Mask, EVL2, RISCVII::TAIL_AGNOSTIC); + + if (IsMaskVector) { + // Truncate Result back to a mask vector (Result has same EVL as Op2) + Result = DAG.getNode( + RISCVISD::SETCC_VL, DL, ContainerVT.changeVectorElementType(MVT::i1), + {Result, DAG.getConstant(0, DL, ContainerVT), + DAG.getCondCode(ISD::SETNE), DAG.getUNDEF(getMaskTypeFor(ContainerVT)), + Mask, EVL2}); + } + + if (!VT.isFixedLengthVector()) + return Result; + return convertFromScalableVector(VT, Result, DAG, Subtarget); +} + SDValue RISCVTargetLowering::lowerVPReverseExperimental(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 41a2dc5771c82..2d9f716cdf9a4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -910,6 +910,7 @@ class RISCVTargetLowering : public TargetLowering { SDValue lowerLogicVPOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPExtMaskOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPSetCCMaskOp(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVPSpliceExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPReverseExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPFPIntConvOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPStridedLoad(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/RISCV/vp-splice-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/vp-splice-fixed-vectors.ll new file mode 100644 index 0000000000000..f7c8c251e197b --- /dev/null +++ b/llvm/test/CodeGen/RISCV/vp-splice-fixed-vectors.ll @@ -0,0 +1,281 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v -verify-machineinstrs -riscv-v-vector-bits-min=128 \ +; RUN: < %s | FileCheck %s + +declare <2 x i64> @llvm.experimental.vp.splice.v2i64(<2 x i64>, <2 x i64>, i32, <2 x i1>, i32, i32) +declare <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32>, <4 x i32>, i32, <4 x i1>, i32, i32) +declare <8 x i16> @llvm.experimental.vp.splice.v8i16(<8 x i16>, <8 x i16>, i32, <8 x i1>, i32, i32) +declare <16 x i8> @llvm.experimental.vp.splice.v16i8(<16 x i8>, <16 x i8>, i32, <16 x i1>, i32, i32) + +declare <2 x double> @llvm.experimental.vp.splice.v2f64(<2 x double>, <2 x double>, i32, <2 x i1>, i32, i32) +declare <4 x float> @llvm.experimental.vp.splice.v4f32(<4 x float>, <4 x float>, i32, <4 x i1>, i32, i32) + +define <2 x i64> @test_vp_splice_v2i64(<2 x i64> %va, <2 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <2 x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer + + %v = call <2 x i64> @llvm.experimental.vp.splice.v2i64(<2 x i64> %va, <2 x i64> %vb, i32 5, <2 x i1> %allones, i32 %evla, i32 %evlb) + ret <2 x i64> %v +} + +define <2 x i64> 
@test_vp_splice_v2i64_negative_offset(<2 x i64> %va, <2 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v2i64_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <2 x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer + + %v = call <2 x i64> @llvm.experimental.vp.splice.v2i64(<2 x i64> %va, <2 x i64> %vb, i32 -5, <2 x i1> %allones, i32 %evla, i32 %evlb) + ret <2 x i64> %v +} + +define <2 x i64> @test_vp_splice_v2i64_masked(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v2i64_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <2 x i64> @llvm.experimental.vp.splice.v2i64(<2 x i64> %va, <2 x i64> %vb, i32 5, <2 x i1> %mask, i32 %evla, i32 %evlb) + ret <2 x i64> %v +} + +define <4 x i32> @test_vp_splice_v4i32(<4 x i32> %va, <4 x i32> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> undef, i1 1, i32 0 + %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer + + %v = call <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32> %va, <4 x i32> %vb, i32 5, <4 x i1> %allones, i32 %evla, i32 %evlb) + ret <4 x i32> %v +} + +define <4 x i32> @test_vp_splice_v4i32_negative_offset(<4 x i32> %va, <4 x i32> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v4i32_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> undef, i1 1, i32 0 + %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer + + %v = call <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32> %va, <4 x i32> %vb, i32 -5, <4 x i1> %allones, i32 %evla, i32 %evlb) + ret <4 x i32> %v +} + +define <4 x i32> @test_vp_splice_v4i32_masked(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v4i32_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <4 x i32> @llvm.experimental.vp.splice.v4i32(<4 x i32> %va, <4 x i32> %vb, i32 5, <4 x i1> %mask, i32 %evla, i32 %evlb) + ret <4 x i32> %v +} + +define <8 x i16> @test_vp_splice_v8i16(<8 x i16> %va, <8 x i16> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; 
CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> undef, i1 1, i32 0 + %allones = shufflevector <8 x i1> %head, <8 x i1> undef, <8 x i32> zeroinitializer + + %v = call <8 x i16> @llvm.experimental.vp.splice.v8i16(<8 x i16> %va, <8 x i16> %vb, i32 5, <8 x i1> %allones, i32 %evla, i32 %evlb) + ret <8 x i16> %v +} + +define <8 x i16> @test_vp_splice_v8i16_negative_offset(<8 x i16> %va, <8 x i16> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v8i16_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> undef, i1 1, i32 0 + %allones = shufflevector <8 x i1> %head, <8 x i1> undef, <8 x i32> zeroinitializer + + %v = call <8 x i16> @llvm.experimental.vp.splice.v8i16(<8 x i16> %va, <8 x i16> %vb, i32 -5, <8 x i1> %allones, i32 %evla, i32 %evlb) + ret <8 x i16> %v +} + +define <8 x i16> @test_vp_splice_v8i16_masked(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v8i16_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <8 x i16> @llvm.experimental.vp.splice.v8i16(<8 x i16> %va, <8 x i16> %vb, i32 5, <8 x i1> %mask, i32 %evla, i32 %evlb) + ret <8 x i16> %v +} + +define <16 x i8> @test_vp_splice_v16i8(<16 x i8> %va, <16 x i8> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> undef, i1 1, i32 0 + %allones = shufflevector <16 x i1> %head, <16 x i1> undef, <16 x i32> zeroinitializer + + %v = call <16 x i8> @llvm.experimental.vp.splice.v16i8(<16 x i8> %va, <16 x i8> %vb, i32 5, <16 x i1> %allones, i32 %evla, i32 %evlb) + ret <16 x i8> %v +} + +define <16 x i8> @test_vp_splice_v16i8_negative_offset(<16 x i8> %va, <16 x i8> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v16i8_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> undef, i1 1, i32 0 + %allones = shufflevector <16 x i1> %head, <16 x i1> undef, <16 x i32> zeroinitializer + + %v = call <16 x i8> @llvm.experimental.vp.splice.v16i8(<16 x i8> %va, <16 x i8> %vb, i32 -5, <16 x i1> %allones, i32 %evla, i32 %evlb) + ret <16 x i8> %v +} + +define <16 x i8> @test_vp_splice_v16i8_masked(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v16i8_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; 
CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <16 x i8> @llvm.experimental.vp.splice.v16i8(<16 x i8> %va, <16 x i8> %vb, i32 5, <16 x i1> %mask, i32 %evla, i32 %evlb) + ret <16 x i8> %v +} + +define <2 x double> @test_vp_splice_v2f64(<2 x double> %va, <2 x double> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <2 x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer + + %v = call <2 x double> @llvm.experimental.vp.splice.v2f64(<2 x double> %va, <2 x double> %vb, i32 5, <2 x i1> %allones, i32 %evla, i32 %evlb) + ret <2 x double> %v +} + +define <2 x double> @test_vp_splice_v2f64_negative_offset(<2 x double> %va, <2 x double> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v2f64_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <2 x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer + + %v = call <2 x double> @llvm.experimental.vp.splice.v2f64(<2 x double> %va, <2 x double> %vb, i32 -5, <2 x i1> %allones, i32 %evla, i32 %evlb) + ret <2 x double> %v +} + +define <2 x double> @test_vp_splice_v2f64_masked(<2 x double> %va, <2 x double> %vb, <2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v2f64_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <2 x double> @llvm.experimental.vp.splice.v2f64(<2 x double> %va, <2 x double> %vb, i32 5, <2 x i1> %mask, i32 %evla, i32 %evlb) + ret <2 x double> %v +} + +define <4 x float> @test_vp_splice_v4f32(<4 x float> %va, <4 x float> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> undef, i1 1, i32 0 + %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer + + %v = call <4 x float> @llvm.experimental.vp.splice.v4f32(<4 x float> %va, <4 x float> %vb, i32 5, <4 x i1> %allones, i32 %evla, i32 %evlb) + ret <4 x float> %v +} + +define <4 x float> @test_vp_splice_v4f32_negative_offset(<4 x float> %va, <4 x float> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v4f32_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> undef, i1 1, i32 0 + %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer 
+ + %v = call <4 x float> @llvm.experimental.vp.splice.v4f32(<4 x float> %va, <4 x float> %vb, i32 -5, <4 x i1> %allones, i32 %evla, i32 %evlb) + ret <4 x float> %v +} + +define <4 x float> @test_vp_splice_v4f32_masked(<4 x float> %va, <4 x float> %vb, <4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v4f32_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <4 x float> @llvm.experimental.vp.splice.v4f32(<4 x float> %va, <4 x float> %vb, i32 5, <4 x i1> %mask, i32 %evla, i32 %evlb) + ret <4 x float> %v +} diff --git a/llvm/test/CodeGen/RISCV/vp-splice-mask-fixed-vectors.ll b/llvm/test/CodeGen/RISCV/vp-splice-mask-fixed-vectors.ll new file mode 100644 index 0000000000000..9579973aee0d6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/vp-splice-mask-fixed-vectors.ll @@ -0,0 +1,316 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs -riscv-v-vector-bits-min=128 \ +; RUN: < %s | FileCheck %s + +declare <2 x i1> @llvm.experimental.vp.splice.v2i1(<2 x i1>, <2 x i1>, i32, <2 x i1>, i32, i32) +declare <4 x i1> @llvm.experimental.vp.splice.v4i1(<4 x i1>, <4 x i1>, i32, <4 x i1>, i32, i32) +declare <8 x i1> @llvm.experimental.vp.splice.v8i1(<8 x i1>, <8 x i1>, i32, <8 x i1>, i32, i32) +declare <16 x i1> @llvm.experimental.vp.splice.v16i1(<16 x i1>, <16 x i1>, i32, <16 x i1>, i32, i32) + +define <2 x i1> @test_vp_splice_v2i1(<2 x i1> %va, <2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v9, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <2 x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer + + %v = call <2 x i1> @llvm.experimental.vp.splice.v2i1(<2 x i1> %va, <2 x i1> %vb, i32 5, <2 x i1> %allones, i32 %evla, i32 %evlb) + ret <2 x i1> %v +} + +define <2 x i1> @test_vp_splice_v2i1_negative_offset(<2 x i1> %va, <2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v2i1_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, mf8, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <2 
x i1> %head, <2 x i1> undef, <2 x i32> zeroinitializer + + %v = call <2 x i1> @llvm.experimental.vp.splice.v2i1(<2 x i1> %va, <2 x i1> %vb, i32 -5, <2 x i1> %allones, i32 %evla, i32 %evlb) + ret <2 x i1> %v +} + +define <2 x i1> @test_vp_splice_v2i1_masked(<2 x i1> %va, <2 x i1> %vb, <2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v2i1_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v10, v11, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t +; CHECK-NEXT: ret + %v = call <2 x i1> @llvm.experimental.vp.splice.v2i1(<2 x i1> %va, <2 x i1> %vb, i32 5, <2 x i1> %mask, i32 %evla, i32 %evlb) + ret <2 x i1> %v +} + +define <4 x i1> @test_vp_splice_v4i1(<4 x i1> %va, <4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v9, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> undef, i1 1, i32 0 + %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer + + %v = call <4 x i1> @llvm.experimental.vp.splice.v4i1(<4 x i1> %va, <4 x i1> %vb, i32 5, <4 x i1> %allones, i32 %evla, i32 %evlb) + ret <4 x i1> %v +} + +define <4 x i1> @test_vp_splice_v4i1_negative_offset(<4 x i1> %va, <4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v4i1_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> undef, i1 1, i32 0 + %allones = shufflevector <4 x i1> %head, <4 x i1> undef, <4 x i32> zeroinitializer + + %v = call <4 x i1> @llvm.experimental.vp.splice.v4i1(<4 x i1> %va, <4 x i1> %vb, i32 -5, <4 x i1> %allones, i32 %evla, i32 %evlb) + ret <4 x i1> %v +} + +define <4 x i1> @test_vp_splice_v4i1_masked(<4 x i1> %va, <4 x i1> %vb, <4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v4i1_masked: +; CHECK: 
# %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v10, v11, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t +; CHECK-NEXT: ret + %v = call <4 x i1> @llvm.experimental.vp.splice.v4i1(<4 x i1> %va, <4 x i1> %vb, i32 5, <4 x i1> %mask, i32 %evla, i32 %evlb) + ret <4 x i1> %v +} + +define <8 x i1> @test_vp_splice_v8i1(<8 x i1> %va, <8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v9, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> undef, i1 1, i32 0 + %allones = shufflevector <8 x i1> %head, <8 x i1> undef, <8 x i32> zeroinitializer + + %v = call <8 x i1> @llvm.experimental.vp.splice.v8i1(<8 x i1> %va, <8 x i1> %vb, i32 5, <8 x i1> %allones, i32 %evla, i32 %evlb) + ret <8 x i1> %v +} + +define <8 x i1> @test_vp_splice_v8i1_negative_offset(<8 x i1> %va, <8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v8i1_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> undef, i1 1, i32 0 + %allones = shufflevector <8 x i1> %head, <8 x i1> undef, <8 x i32> zeroinitializer + + %v = call <8 x i1> @llvm.experimental.vp.splice.v8i1(<8 x i1> %va, <8 x i1> %vb, i32 -5, <8 x i1> %allones, i32 %evla, i32 %evlb) + ret <8 x i1> %v +} + +define <8 x i1> @test_vp_splice_v8i1_masked(<8 x i1> %va, <8 x i1> %vb, <8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v8i1_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v10, v11, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli 
zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t +; CHECK-NEXT: ret + %v = call <8 x i1> @llvm.experimental.vp.splice.v8i1(<8 x i1> %va, <8 x i1> %vb, i32 5, <8 x i1> %mask, i32 %evla, i32 %evlb) + ret <8 x i1> %v +} + +define <16 x i1> @test_vp_splice_v16i1(<16 x i1> %va, <16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v9, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> undef, i1 1, i32 0 + %allones = shufflevector <16 x i1> %head, <16 x i1> undef, <16 x i32> zeroinitializer + + %v = call <16 x i1> @llvm.experimental.vp.splice.v16i1(<16 x i1> %va, <16 x i1> %vb, i32 5, <16 x i1> %allones, i32 %evla, i32 %evlb) + ret <16 x i1> %v +} + +define <16 x i1> @test_vp_splice_v16i1_negative_offset(<16 x i1> %va, <16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v16i1_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> undef, i1 1, i32 0 + %allones = shufflevector <16 x i1> %head, <16 x i1> undef, <16 x i32> zeroinitializer + + %v = call <16 x i1> @llvm.experimental.vp.splice.v16i1(<16 x i1> %va, <16 x i1> %vb, i32 -5, <16 x i1> %allones, i32 %evla, i32 %evlb) + ret <16 x i1> %v +} + +define <16 x i1> @test_vp_splice_v16i1_masked(<16 x i1> %va, <16 x i1> %vb, <16 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_v16i1_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v10, v11, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t +; CHECK-NEXT: ret + %v = call <16 x i1> @llvm.experimental.vp.splice.v16i1(<16 x 
i1> %va, <16 x i1> %vb, i32 5, <16 x i1> %mask, i32 %evla, i32 %evlb) + ret <16 x i1> %v +} diff --git a/llvm/test/CodeGen/RISCV/vp-splice-mask-vectors.ll b/llvm/test/CodeGen/RISCV/vp-splice-mask-vectors.ll new file mode 100644 index 0000000000000..4eaadb3c24fbb --- /dev/null +++ b/llvm/test/CodeGen/RISCV/vp-splice-mask-vectors.ll @@ -0,0 +1,553 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +declare <vscale x 1 x i1> @llvm.experimental.vp.splice.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, i32, <vscale x 1 x i1>, i32, i32) +declare <vscale x 2 x i1> @llvm.experimental.vp.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32, <vscale x 2 x i1>, i32, i32) +declare <vscale x 4 x i1> @llvm.experimental.vp.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32, <vscale x 4 x i1>, i32, i32) +declare <vscale x 8 x i1> @llvm.experimental.vp.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32, <vscale x 8 x i1>, i32, i32) +declare <vscale x 16 x i1> @llvm.experimental.vp.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32, <vscale x 16 x i1>, i32, i32) +declare <vscale x 32 x i1> @llvm.experimental.vp.splice.nxv32i1(<vscale x 32 x i1>, <vscale x 32 x i1>, i32, <vscale x 32 x i1>, i32, i32) +declare <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i1>, <vscale x 64 x i1>, i32, <vscale x 64 x i1>, i32, i32) + +define <vscale x 1 x i1> @test_vp_splice_nxv1i1(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv1i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v9, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer + + %v = call <vscale x 1 x i1> @llvm.experimental.vp.splice.nxv1i1(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 1 x i1> %v +} + +define <vscale x 1 x i1> @test_vp_splice_nxv1i1_negative_offset(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv1i1_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, mf8, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 
1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer + + %v = call <vscale x 1 x i1> @llvm.experimental.vp.splice.nxv1i1(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 -5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 1 x i1> %v +} + +define <vscale x 1 x i1> @test_vp_splice_nxv1i1_masked(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, <vscale x 1 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv1i1_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v10, v11, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t +; CHECK-NEXT: ret + %v = call <vscale x 1 x i1> @llvm.experimental.vp.splice.nxv1i1(<vscale x 1 x i1> %va, <vscale x 1 x i1> %vb, i32 5, <vscale x 1 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 1 x i1> %v +} + +define <vscale x 2 x i1> @test_vp_splice_nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v9, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer + + %v = call <vscale x 2 x i1> @llvm.experimental.vp.splice.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 2 x i1> %v +} + +define <vscale x 2 x i1> @test_vp_splice_nxv2i1_negative_offset(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2i1_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer + + 
%v = call <vscale x 2 x i1> @llvm.experimental.vp.splice.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 -5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 2 x i1> %v +} + +define <vscale x 2 x i1> @test_vp_splice_nxv2i1_masked(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, <vscale x 2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2i1_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v10, v11, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t +; CHECK-NEXT: ret + %v = call <vscale x 2 x i1> @llvm.experimental.vp.splice.nxv2i1(<vscale x 2 x i1> %va, <vscale x 2 x i1> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 2 x i1> %v +} + +define <vscale x 4 x i1> @test_vp_splice_nxv4i1(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv4i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v9, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 4 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer + + %v = call <vscale x 4 x i1> @llvm.experimental.vp.splice.nxv4i1(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 5, <vscale x 4 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 4 x i1> %v +} + +define <vscale x 4 x i1> @test_vp_splice_nxv4i1_negative_offset(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv4i1_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 4 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer + + %v = call <vscale x 4 x i1> @llvm.experimental.vp.splice.nxv4i1(<vscale x 4 x 
i1> %va, <vscale x 4 x i1> %vb, i32 -5, <vscale x 4 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 4 x i1> %v +} + +define <vscale x 4 x i1> @test_vp_splice_nxv4i1_masked(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, <vscale x 4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv4i1_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v10, v11, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t +; CHECK-NEXT: ret + %v = call <vscale x 4 x i1> @llvm.experimental.vp.splice.nxv4i1(<vscale x 4 x i1> %va, <vscale x 4 x i1> %vb, i32 5, <vscale x 4 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 4 x i1> %v +} + +define <vscale x 8 x i1> @test_vp_splice_nxv8i1(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv8i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v9, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a0 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 8 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer + + %v = call <vscale x 8 x i1> @llvm.experimental.vp.splice.nxv8i1(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 5, <vscale x 8 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 8 x i1> %v +} + +define <vscale x 8 x i1> @test_vp_splice_nxv8i1_negative_offset(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv8i1_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v9, v10, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v9, v9, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 8 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer + + %v = call <vscale x 8 x i1> @llvm.experimental.vp.splice.nxv8i1(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 -5, <vscale x 8 x i1> %allones, i32 %evla, i32 
%evlb) + ret <vscale x 8 x i1> %v +} + +define <vscale x 8 x i1> @test_vp_splice_nxv8i1_masked(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, <vscale x 8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv8i1_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v8, v11, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v11, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v10, v11, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vslideup.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vmsne.vi v0, v10, 0, v0.t +; CHECK-NEXT: ret + %v = call <vscale x 8 x i1> @llvm.experimental.vp.splice.nxv8i1(<vscale x 8 x i1> %va, <vscale x 8 x i1> %vb, i32 5, <vscale x 8 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 8 x i1> %v +} + +define <vscale x 16 x i1> @test_vp_splice_nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 16 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer + + %v = call <vscale x 16 x i1> @llvm.experimental.vp.splice.nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 5, <vscale x 16 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 16 x i1> %v +} + +define <vscale x 16 x i1> @test_vp_splice_nxv16i1_negative_offset(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv16i1_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v8, v12, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 5 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 16 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer + + %v = call <vscale x 16 x i1> @llvm.experimental.vp.splice.nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 -5, <vscale x 16 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 16 x i1> %v +} + +define <vscale x 16 
x i1> @test_vp_splice_nxv16i1_masked(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, <vscale x 16 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv16i1_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v14, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v10, v14, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vslidedown.vi v10, v10, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vslideup.vx v10, v12, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; CHECK-NEXT: vmsne.vi v8, v10, 0, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %v = call <vscale x 16 x i1> @llvm.experimental.vp.splice.nxv16i1(<vscale x 16 x i1> %va, <vscale x 16 x i1> %vb, i32 5, <vscale x 16 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 16 x i1> %v +} + +define <vscale x 32 x i1> @test_vp_splice_nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 32 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> undef, <vscale x 32 x i32> zeroinitializer + + %v = call <vscale x 32 x i1> @llvm.experimental.vp.splice.nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 5, <vscale x 32 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 32 x i1> %v +} + +define <vscale x 32 x i1> @test_vp_splice_nxv32i1_negative_offset(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv32i1_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 5 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 32 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> undef, <vscale x 32 x i32> zeroinitializer + + %v = call <vscale x 32 x i1> @llvm.experimental.vp.splice.nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 -5, <vscale x 32 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 32 x i1> %v +} + +define <vscale x 32 x i1> 
@test_vp_splice_nxv32i1_masked(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, <vscale x 32 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv32i1_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vslidedown.vi v16, v16, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vslideup.vx v16, v12, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma +; CHECK-NEXT: vmsne.vi v8, v16, 0, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %v = call <vscale x 32 x i1> @llvm.experimental.vp.splice.nxv32i1(<vscale x 32 x i1> %va, <vscale x 32 x i1> %vb, i32 5, <vscale x 32 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 32 x i1> %v +} + +define <vscale x 64 x i1> @test_vp_splice_nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv64i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 64 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> undef, <vscale x 64 x i32> zeroinitializer + + %v = call <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 5, <vscale x 64 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 64 x i1> %v +} + +define <vscale x 64 x i1> @test_vp_splice_nxv64i1_negative_offset(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv64i1_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v9, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vslideup.vi v8, v16, 5 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 64 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> undef, <vscale x 64 x i32> zeroinitializer + + %v = call <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 -5, <vscale x 64 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 64 x i1> %v +} + +define <vscale x 64 x i1> 
@test_vp_splice_nxv64i1_masked(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, <vscale x 64 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv64i1_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vim v24, v24, 1, v0 +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vslidedown.vi v24, v24, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vslideup.vx v24, v16, a0, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, m8, ta, ma +; CHECK-NEXT: vmsne.vi v8, v24, 0, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: ret + %v = call <vscale x 64 x i1> @llvm.experimental.vp.splice.nxv64i1(<vscale x 64 x i1> %va, <vscale x 64 x i1> %vb, i32 5, <vscale x 64 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 64 x i1> %v +} diff --git a/llvm/test/CodeGen/RISCV/vp-splice.ll b/llvm/test/CodeGen/RISCV/vp-splice.ll new file mode 100644 index 0000000000000..7d85370e390b0 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/vp-splice.ll @@ -0,0 +1,330 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple riscv64 -mattr=+f,+d,+v -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +declare <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32, <vscale x 2 x i1>, i32, i32) + +declare <vscale x 1 x i64> @llvm.experimental.vp.splice.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, i32, <vscale x 1 x i1>, i32, i32) +declare <vscale x 2 x i32> @llvm.experimental.vp.splice.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, i32, <vscale x 2 x i1>, i32, i32) +declare <vscale x 4 x i16> @llvm.experimental.vp.splice.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, i32, <vscale x 4 x i1>, i32, i32) +declare <vscale x 8 x i8> @llvm.experimental.vp.splice.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, i32, <vscale x 8 x i1>, i32, i32) + +declare <vscale x 1 x double> @llvm.experimental.vp.splice.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, i32, <vscale x 1 x i1>, i32, i32) +declare <vscale x 2 x float> @llvm.experimental.vp.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32, <vscale x 2 x i1>, i32, i32) + +declare <vscale x 16 x i64> @llvm.experimental.vp.splice.nxv16i64(<vscale x 16 x i64>, <vscale x 16 x i64>, i32, <vscale x 16 x i1>, i32, i32) + +define <vscale x 2 x i64> @test_vp_splice_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer + + %v = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, i32 5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 2 x i64> %v +} + +define <vscale x 2 x i64> @test_vp_splice_nxv2i64_negative_offset(<vscale x 2 x 
i64> %va, <vscale x 2 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2i64_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 5 +; CHECK-NEXT: ret + %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer + + %v = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, i32 -5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 2 x i64> %v +} + +define <vscale x 2 x i64> @test_vp_splice_nxv2i64_masked(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2i64_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vslideup.vx v8, v10, a0, v0.t +; CHECK-NEXT: ret + %v = call <vscale x 2 x i64> @llvm.experimental.vp.splice.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 2 x i64> %v +} + +define <vscale x 1 x i64> @test_vp_splice_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer + + %v = call <vscale x 1 x i64> @llvm.experimental.vp.splice.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, i32 5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 1 x i64> %v +} + +define <vscale x 1 x i64> @test_vp_splice_nxv1i64_negative_offset(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv1i64_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer + + %v = call <vscale x 1 x i64> @llvm.experimental.vp.splice.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, i32 -5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 1 x i64> %v +} + +define <vscale x 1 x i64> @test_vp_splice_nxv1i64_masked(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv1i64_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v 
= call <vscale x 1 x i64> @llvm.experimental.vp.splice.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, i32 5, <vscale x 1 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 1 x i64> %v +} + +define <vscale x 2 x i32> @test_vp_splice_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer + + %v = call <vscale x 2 x i32> @llvm.experimental.vp.splice.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, i32 5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 2 x i32> %v +} + +define <vscale x 2 x i32> @test_vp_splice_nxv2i32_negative_offset(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2i32_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer + + %v = call <vscale x 2 x i32> @llvm.experimental.vp.splice.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, i32 -5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 2 x i32> %v +} + +define <vscale x 2 x i32> @test_vp_splice_nxv2i32_masked(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2i32_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <vscale x 2 x i32> @llvm.experimental.vp.splice.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 2 x i32> %v +} + +define <vscale x 4 x i16> @test_vp_splice_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 4 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer + + %v = call <vscale x 4 x i16> @llvm.experimental.vp.splice.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, i32 5, <vscale x 4 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 4 x i16> %v +} + +define <vscale x 4 x i16> @test_vp_splice_nxv4i16_negative_offset(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv4i16_negative_offset: +; CHECK: # %bb.0: +; 
CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <vscale x 4 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer + + %v = call <vscale x 4 x i16> @llvm.experimental.vp.splice.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, i32 -5, <vscale x 4 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 4 x i16> %v +} + +define <vscale x 4 x i16> @test_vp_splice_nxv4i16_masked(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv4i16_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <vscale x 4 x i16> @llvm.experimental.vp.splice.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, i32 5, <vscale x 4 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 4 x i16> %v +} + +define <vscale x 8 x i8> @test_vp_splice_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 8 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer + + %v = call <vscale x 8 x i8> @llvm.experimental.vp.splice.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, i32 5, <vscale x 8 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 8 x i8> %v +} + +define <vscale x 8 x i8> @test_vp_splice_nxv8i8_negative_offset(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv8i8_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <vscale x 8 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer + + %v = call <vscale x 8 x i8> @llvm.experimental.vp.splice.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, i32 -5, <vscale x 8 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 8 x i8> %v +} + +define <vscale x 8 x i8> @test_vp_splice_nxv8i8_masked(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv8i8_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <vscale x 8 x i8> @llvm.experimental.vp.splice.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, i32 5, <vscale x 8 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 8 x i8> 
%v +} + +define <vscale x 1 x double> @test_vp_splice_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer + + %v = call <vscale x 1 x double> @llvm.experimental.vp.splice.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, i32 5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 1 x double> %v +} + +define <vscale x 1 x double> @test_vp_splice_nxv1f64_negative_offset(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv1f64_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <vscale x 1 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> undef, <vscale x 1 x i32> zeroinitializer + + %v = call <vscale x 1 x double> @llvm.experimental.vp.splice.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, i32 -5, <vscale x 1 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 1 x double> %v +} + +define <vscale x 1 x double> @test_vp_splice_nxv1f64_masked(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, <vscale x 1 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv1f64_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <vscale x 1 x double> @llvm.experimental.vp.splice.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, i32 5, <vscale x 1 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 1 x double> %v +} + +define <vscale x 2 x float> @test_vp_splice_nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer + + %v = call <vscale x 2 x float> @llvm.experimental.vp.splice.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 2 x float> %v +} + +define <vscale x 2 x float> @test_vp_splice_nxv2f32_negative_offset(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2f32_negative_offset: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetivli zero, 5, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vx 
v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 5 +; CHECK-NEXT: ret + %head = insertelement <vscale x 2 x i1> undef, i1 1, i32 0 + %allones = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer + + %v = call <vscale x 2 x float> @llvm.experimental.vp.splice.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 -5, <vscale x 2 x i1> %allones, i32 %evla, i32 %evlb) + ret <vscale x 2 x float> %v +} + +define <vscale x 2 x float> @test_vp_splice_nxv2f32_masked(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, <vscale x 2 x i1> %mask, i32 zeroext %evla, i32 zeroext %evlb) { +; CHECK-LABEL: test_vp_splice_nxv2f32_masked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a0, a0, -5 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vslideup.vx v8, v9, a0, v0.t +; CHECK-NEXT: ret + %v = call <vscale x 2 x float> @llvm.experimental.vp.splice.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 5, <vscale x 2 x i1> %mask, i32 %evla, i32 %evlb) + ret <vscale x 2 x float> %v +} From f97e559539afc08700a6b4bf134d535fd237625e Mon Sep 17 00:00:00 2001 From: Nico Weber <thakis@chromium.org> Date: Thu, 21 Dec 2023 11:39:13 -0500 Subject: [PATCH 090/342] [gn] port e3627e2690a (TextAPI/BinaryReader) --- .../secondary/llvm/lib/TextAPI/BinaryReader/BUILD.gn | 10 ++++++++++ .../gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn | 1 + 2 files changed, 11 insertions(+) create mode 100644 llvm/utils/gn/secondary/llvm/lib/TextAPI/BinaryReader/BUILD.gn diff --git a/llvm/utils/gn/secondary/llvm/lib/TextAPI/BinaryReader/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/TextAPI/BinaryReader/BUILD.gn new file mode 100644 index 0000000000000..3eeb32aae4e3c --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/lib/TextAPI/BinaryReader/BUILD.gn @@ -0,0 +1,10 @@ +static_library("BinaryReader") { + output_name = "LLVMTextAPIBinaryReader" + deps = [ + "//llvm/lib/Object", + "//llvm/lib/Support", + "//llvm/lib/TargetParser", + "//llvm/lib/TextAPI", + ] + sources = [ "DylibReader.cpp" ] +} diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn index a562d04c4e90d..df0a6b01755aa 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn @@ -30,6 +30,7 @@ executable("llvm-readtapi") { "//llvm/lib/Option", "//llvm/lib/Support", "//llvm/lib/TextAPI", + "//llvm/lib/TextAPI/BinaryReader", ] sources = [ "DiffEngine.cpp", From 4cdeef510e136865c2445dedb5a0f72cd11d4527 Mon Sep 17 00:00:00 2001 From: Nico Weber <thakis@chromium.org> Date: Thu, 21 Dec 2023 11:37:22 -0500 Subject: [PATCH 091/342] [gn] port c6f29dbb596f --- llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn | 1 + llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn index f1c604aa420bd..ed312b2486564 100644 --- a/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/TextAPI/BUILD.gn @@ -20,5 +20,6 @@ static_library("TextAPI") { "TextStub.cpp", "TextStubCommon.cpp", "TextStubV5.cpp", + "Utils.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn index 
df0a6b01755aa..1e67b1dcf036e 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-readtapi/BUILD.gn @@ -26,6 +26,7 @@ group("symlinks") { executable("llvm-readtapi") { deps = [ ":TapiOpts", + "//llvm/lib/BinaryFormat", "//llvm/lib/Object", "//llvm/lib/Option", "//llvm/lib/Support", From 411cba215a9c4034fdaf60d4c79bf803d74e6e69 Mon Sep 17 00:00:00 2001 From: Mikhail Gudim <mgudim@gmail.com> Date: Thu, 21 Dec 2023 11:41:09 -0500 Subject: [PATCH 092/342] =?UTF-8?q?Revert=20"[InstCombine]=20Extend=20`fol?= =?UTF-8?q?dICmpBinOp`=20to=20`add`-like=20`or`.=20(#71=E2=80=A6=20(#76167?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …396)" This reverts commit 8773c9be3d9868288f1f46957945d50ff58e4e91. --- .../InstCombine/InstCombineCompares.cpp | 62 +++++----- llvm/test/Transforms/InstCombine/icmp.ll | 117 ++++-------------- 2 files changed, 53 insertions(+), 126 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 0ad87eeb4c91a..289976718e52f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -4624,35 +4624,27 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, } bool NoOp0WrapProblem = false, NoOp1WrapProblem = false; - bool Op0HasNUW = false, Op1HasNUW = false; - bool Op0HasNSW = false, Op1HasNSW = false; + if (BO0 && isa<OverflowingBinaryOperator>(BO0)) + NoOp0WrapProblem = + ICmpInst::isEquality(Pred) || + (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) || + (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap()); + if (BO1 && isa<OverflowingBinaryOperator>(BO1)) + NoOp1WrapProblem = + ICmpInst::isEquality(Pred) || + (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) || + (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap()); + // Analyze the case when either Op0 or Op1 is an add instruction. // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null). - auto hasNoWrapProblem = [](const BinaryOperator &BO, CmpInst::Predicate Pred, - bool &HasNSW, bool &HasNUW) -> bool { - if (isa<OverflowingBinaryOperator>(BO)) { - HasNUW = BO.hasNoUnsignedWrap(); - HasNSW = BO.hasNoSignedWrap(); - return ICmpInst::isEquality(Pred) || - (CmpInst::isUnsigned(Pred) && HasNUW) || - (CmpInst::isSigned(Pred) && HasNSW); - } else if (BO.getOpcode() == Instruction::Or) { - HasNUW = true; - HasNSW = true; - return true; - } else { - return false; - } - }; Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr; - - if (BO0) { - match(BO0, m_AddLike(m_Value(A), m_Value(B))); - NoOp0WrapProblem = hasNoWrapProblem(*BO0, Pred, Op0HasNSW, Op0HasNUW); + if (BO0 && BO0->getOpcode() == Instruction::Add) { + A = BO0->getOperand(0); + B = BO0->getOperand(1); } - if (BO1) { - match(BO1, m_AddLike(m_Value(C), m_Value(D))); - NoOp1WrapProblem = hasNoWrapProblem(*BO1, Pred, Op1HasNSW, Op1HasNUW); + if (BO1 && BO1->getOpcode() == Instruction::Add) { + C = BO1->getOperand(0); + D = BO1->getOperand(1); } // icmp (A+B), A -> icmp B, 0 for equalities or if there is no overflow. 
@@ -4772,15 +4764,17 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, APInt AP2Abs = AP2->abs(); if (AP1Abs.uge(AP2Abs)) { APInt Diff = *AP1 - *AP2; + bool HasNUW = BO0->hasNoUnsignedWrap() && Diff.ule(*AP1); + bool HasNSW = BO0->hasNoSignedWrap(); Constant *C3 = Constant::getIntegerValue(BO0->getType(), Diff); - Value *NewAdd = Builder.CreateAdd( - A, C3, "", Op0HasNUW && Diff.ule(*AP1), Op0HasNSW); + Value *NewAdd = Builder.CreateAdd(A, C3, "", HasNUW, HasNSW); return new ICmpInst(Pred, NewAdd, C); } else { APInt Diff = *AP2 - *AP1; + bool HasNUW = BO1->hasNoUnsignedWrap() && Diff.ule(*AP2); + bool HasNSW = BO1->hasNoSignedWrap(); Constant *C3 = Constant::getIntegerValue(BO0->getType(), Diff); - Value *NewAdd = Builder.CreateAdd( - C, C3, "", Op1HasNUW && Diff.ule(*AP1), Op1HasNSW); + Value *NewAdd = Builder.CreateAdd(C, C3, "", HasNUW, HasNSW); return new ICmpInst(Pred, A, NewAdd); } } @@ -4874,14 +4868,16 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, isKnownNonZero(Z, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); // if Z != 0 and nsw(X * Z) and nsw(Y * Z) // X * Z eq/ne Y * Z -> X eq/ne Y - if (NonZero && BO0 && BO1 && Op0HasNSW && Op1HasNSW) + if (NonZero && BO0 && BO1 && BO0->hasNoSignedWrap() && + BO1->hasNoSignedWrap()) return new ICmpInst(Pred, X, Y); } else NonZero = isKnownNonZero(Z, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); // If Z != 0 and nuw(X * Z) and nuw(Y * Z) // X * Z u{lt/le/gt/ge}/eq/ne Y * Z -> X u{lt/le/gt/ge}/eq/ne Y - if (NonZero && BO0 && BO1 && Op0HasNUW && Op1HasNUW) + if (NonZero && BO0 && BO1 && BO0->hasNoUnsignedWrap() && + BO1->hasNoUnsignedWrap()) return new ICmpInst(Pred, X, Y); } } @@ -4980,8 +4976,8 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); case Instruction::Shl: { - bool NUW = Op0HasNUW && Op1HasNUW; - bool NSW = Op0HasNSW && Op1HasNSW; + bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap(); + bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap(); if (!NUW && !NSW) break; if (!NSW && I.isSigned()) diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index fe831934f4f33..1c7bb36f0d34c 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -3862,9 +3862,10 @@ define <8 x i1> @bitreverse_vec_ne(<8 x i16> %x, <8 x i16> %y) { define i1 @knownbits1(i8 %a, i8 %b) { ; CHECK-LABEL: @knownbits1( ; CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], 1 +; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4 ; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1 -; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A1]], [[TMP1]] +; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A2]], [[B2]] ; CHECK-NEXT: ret i1 [[C]] ; %a1 = and i8 %a, 5 @@ -3878,9 +3879,10 @@ define i1 @knownbits1(i8 %a, i8 %b) { define i1 @knownbits2(i8 %a, i8 %b) { ; CHECK-LABEL: @knownbits2( ; CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], 1 +; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4 ; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1 -; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[A1]], [[TMP1]] +; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5 +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[A2]], [[B2]] ; CHECK-NEXT: ret i1 [[C]] ; %a1 = and i8 %a, 5 @@ -3894,9 +3896,10 @@ define i1 @knownbits2(i8 %a, i8 %b) { define i1 @knownbits3(i8 %a, i8 %b) { ; CHECK-LABEL: @knownbits3( ; 
CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], 1 +; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4 ; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1 -; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[TMP1]], [[A1]] +; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[B2]], [[A2]] ; CHECK-NEXT: ret i1 [[C]] ; %a1 = and i8 %a, 5 @@ -3910,9 +3913,10 @@ define i1 @knownbits3(i8 %a, i8 %b) { define <2 x i1> @knownbits4(<2 x i8> %a, <2 x i8> %b) { ; CHECK-LABEL: @knownbits4( ; CHECK-NEXT: [[A1:%.*]] = and <2 x i8> [[A:%.*]], <i8 1, i8 1> +; CHECK-NEXT: [[A2:%.*]] = or disjoint <2 x i8> [[A1]], <i8 4, i8 4> ; CHECK-NEXT: [[B1:%.*]] = and <2 x i8> [[B:%.*]], <i8 2, i8 2> -; CHECK-NEXT: [[TMP1:%.*]] = or disjoint <2 x i8> [[B1]], <i8 1, i8 1> -; CHECK-NEXT: [[C:%.*]] = icmp ne <2 x i8> [[TMP1]], [[A1]] +; CHECK-NEXT: [[B2:%.*]] = or disjoint <2 x i8> [[B1]], <i8 5, i8 5> +; CHECK-NEXT: [[C:%.*]] = icmp ne <2 x i8> [[B2]], [[A2]] ; CHECK-NEXT: ret <2 x i1> [[C]] ; %a1 = and <2 x i8> %a, <i8 5, i8 5> @@ -3928,9 +3932,10 @@ define <2 x i1> @knownbits4(<2 x i8> %a, <2 x i8> %b) { define i1 @knownbits5(i8 %a, i8 %b) { ; CHECK-LABEL: @knownbits5( ; CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], -127 +; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4 ; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1 -; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A1]], [[TMP1]] +; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[A2]], [[B2]] ; CHECK-NEXT: ret i1 [[C]] ; %a1 = and i8 %a, 133 @@ -3944,9 +3949,10 @@ define i1 @knownbits5(i8 %a, i8 %b) { define i1 @knownbits6(i8 %a, i8 %b) { ; CHECK-LABEL: @knownbits6( ; CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], -127 +; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4 ; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1 -; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[A1]], [[TMP1]] +; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5 +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[A2]], [[B2]] ; CHECK-NEXT: ret i1 [[C]] ; %a1 = and i8 %a, 133 @@ -3960,9 +3966,10 @@ define i1 @knownbits6(i8 %a, i8 %b) { define <2 x i1> @knownbits7(<2 x i8> %a, <2 x i8> %b) { ; CHECK-LABEL: @knownbits7( ; CHECK-NEXT: [[A1:%.*]] = and <2 x i8> [[A:%.*]], <i8 -127, i8 -127> +; CHECK-NEXT: [[A2:%.*]] = or disjoint <2 x i8> [[A1]], <i8 4, i8 4> ; CHECK-NEXT: [[B1:%.*]] = and <2 x i8> [[B:%.*]], <i8 2, i8 2> -; CHECK-NEXT: [[TMP1:%.*]] = or disjoint <2 x i8> [[B1]], <i8 1, i8 1> -; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i8> [[TMP1]], [[A1]] +; CHECK-NEXT: [[B2:%.*]] = or disjoint <2 x i8> [[B1]], <i8 5, i8 5> +; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i8> [[B2]], [[A2]] ; CHECK-NEXT: ret <2 x i1> [[C]] ; %a1 = and <2 x i8> %a, <i8 133, i8 133> @@ -3976,9 +3983,10 @@ define <2 x i1> @knownbits7(<2 x i8> %a, <2 x i8> %b) { define i1 @knownbits8(i8 %a, i8 %b) { ; CHECK-LABEL: @knownbits8( ; CHECK-NEXT: [[A1:%.*]] = and i8 [[A:%.*]], -127 +; CHECK-NEXT: [[A2:%.*]] = or disjoint i8 [[A1]], 4 ; CHECK-NEXT: [[B1:%.*]] = and i8 [[B:%.*]], 2 -; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[B1]], 1 -; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[TMP1]], [[A1]] +; CHECK-NEXT: [[B2:%.*]] = or disjoint i8 [[B1]], 5 +; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[B2]], [[A2]] ; CHECK-NEXT: ret i1 [[C]] ; %a1 = and i8 %a, 133 @@ -4904,80 +4912,3 @@ define i1 @or_positive_sgt_zero_multi_use(i8 %a) { %cmp = icmp sgt i8 %b, 0 ret i1 %cmp } - - 
-define i1 @disjoint_or_sgt_1(i8 %a, i8 %b) { -; CHECK-LABEL: @disjoint_or_sgt_1( -; CHECK-NEXT: [[B1:%.*]] = add nsw i8 [[B:%.*]], 2 -; CHECK-NEXT: [[ICMP_:%.*]] = icmp sle i8 [[B1]], [[A:%.*]] -; CHECK-NEXT: ret i1 [[ICMP_]] -; - %a1 = or disjoint i8 %a, 1 - %b1 = add nsw i8 %b, 2 - %icmp_ = icmp sgt i8 %a1, %b1 - ret i1 %icmp_ -} - -define i1 @disjoint_or_sgt_2(i8 %a, i8 %b) { -; CHECK-LABEL: @disjoint_or_sgt_2( -; CHECK-NEXT: [[A1:%.*]] = or disjoint i8 [[A:%.*]], 2 -; CHECK-NEXT: [[B1:%.*]] = add i8 [[B:%.*]], 1 -; CHECK-NEXT: [[ICMP_:%.*]] = icmp sgt i8 [[A1]], [[B1]] -; CHECK-NEXT: ret i1 [[ICMP_]] -; - %a1 = or disjoint i8 %a, 2 - %b1 = add i8 %b, 1 - %icmp_ = icmp sgt i8 %a1, %b1 - ret i1 %icmp_ -} - -define i1 @disjoint_or_sgt_3(i8 %a, i8 %b) { -; CHECK-LABEL: @disjoint_or_sgt_3( -; CHECK-NEXT: [[A1:%.*]] = or disjoint i8 [[A:%.*]], 2 -; CHECK-NEXT: [[B1:%.*]] = add nuw i8 [[B:%.*]], 1 -; CHECK-NEXT: [[ICMP_:%.*]] = icmp sgt i8 [[A1]], [[B1]] -; CHECK-NEXT: ret i1 [[ICMP_]] -; - %a1 = or disjoint i8 %a, 2 - %b1 = add nuw i8 %b, 1 - %icmp_ = icmp sgt i8 %a1, %b1 - ret i1 %icmp_ -} - -define i1 @disjoint_or_ugt_1(i8 %a, i8 %b) { -; CHECK-LABEL: @disjoint_or_ugt_1( -; CHECK-NEXT: [[B1:%.*]] = add nsw i8 [[B:%.*]], 2 -; CHECK-NEXT: [[ICMP_:%.*]] = icmp ule i8 [[B1]], [[A:%.*]] -; CHECK-NEXT: ret i1 [[ICMP_]] -; - %a1 = or disjoint i8 %a, 1 - %b1 = add nsw i8 %b, 2 - %icmp_ = icmp ugt i8 %a1, %b1 - ret i1 %icmp_ -} - -define i1 @disjoint_or_ugt_2(i8 %a, i8 %b) { -; CHECK-LABEL: @disjoint_or_ugt_2( -; CHECK-NEXT: [[A1:%.*]] = or disjoint i8 [[A:%.*]], 2 -; CHECK-NEXT: [[B1:%.*]] = add i8 [[B:%.*]], 1 -; CHECK-NEXT: [[ICMP_:%.*]] = icmp ugt i8 [[A1]], [[B1]] -; CHECK-NEXT: ret i1 [[ICMP_]] -; - %a1 = or disjoint i8 %a, 2 - %b1 = add i8 %b, 1 - %icmp_ = icmp ugt i8 %a1, %b1 - ret i1 %icmp_ -} - -define i1 @disjoint_or_ugt_3(i8 %a, i8 %b) { -; CHECK-LABEL: @disjoint_or_ugt_3( -; CHECK-NEXT: [[A1:%.*]] = or disjoint i8 [[A:%.*]], 2 -; CHECK-NEXT: [[B1:%.*]] = add nuw i8 [[B:%.*]], 1 -; CHECK-NEXT: [[ICMP_:%.*]] = icmp ugt i8 [[A1]], [[B1]] -; CHECK-NEXT: ret i1 [[ICMP_]] -; - %a1 = or disjoint i8 %a, 2 - %b1 = add nuw i8 %b, 1 - %icmp_ = icmp ugt i8 %a1, %b1 - ret i1 %icmp_ -} From 77c5c44b01a763e8b2f37f6971c830ed036e5da9 Mon Sep 17 00:00:00 2001 From: Dinar Temirbulatov <Dinar.Temirbulatov@arm.com> Date: Thu, 21 Dec 2023 16:42:24 +0000 Subject: [PATCH 093/342] [AArch64][SME2] Add SME2 MLA/MLS builtins. (#75584) Add SME2 MLA/MLS builtins. 
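A minimal usage sketch of the new builtins, based on the non-overloaded forms exercised by the acle_sme2_mla.c test added below; the wrapper function, its name, and the extra single-vector argument are illustrative only and not part of the patch:

    #include <arm_sme_draft_spec_subject_to_change.h>

    // Hypothetical wrapper; intrinsic names, argument types, and the
    // __arm_streaming/__arm_shared_za attributes follow the tests in this patch.
    // Accumulates products into 32-bit ZA slices starting at slice_base.
    void mla_example(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm,
                     svfloat32_t zs) __arm_streaming __arm_shared_za {
      svmla_za32_f32_vg1x2(slice_base, zn, zm);         // multi-vector x multi-vector
      svmla_single_za32_f32_vg1x2(slice_base, zn, zs);  // multi-vector x single vector
      svmla_lane_za32_f32_vg1x2(slice_base, zn, zs, 3); // indexed form, lane in [0, 3]
    }

The svmls_* forms follow the same pattern, and the za64 variants additionally require the sme-f64f64 or sme-i16i64 target features, as guarded in arm_sme.td below.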
--- clang/include/clang/Basic/arm_sme.td | 213 ++ clang/lib/CodeGen/CGBuiltin.cpp | 25 + .../aarch64-sme2-intrinsics/acle_sme2_mla.c | 292 +++ .../aarch64-sme2-intrinsics/acle_sme2_mlal.c | 696 +++++++ .../aarch64-sme2-intrinsics/acle_sme2_mlall.c | 1790 +++++++++++++++++ .../aarch64-sme2-intrinsics/acle_sme2_mls.c | 292 +++ .../aarch64-sme2-intrinsics/acle_sme2_mlsl.c | 696 +++++++ .../aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 44 + 8 files changed, 4048 insertions(+) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index ce99ca82c1d39..2a9bc6870bf71 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -353,6 +353,219 @@ let TargetGuard = "sme2" in { def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; } +// FMLA/FMLS +let TargetGuard = "sme2" in { + def SVMLA_MULTI_VG1x2_F32 : Inst<"svmla_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVMLA_MULTI_VG1x4_F32 : Inst<"svmla_za32[_{d}]_vg1x4", "vm44", "f", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVMLS_MULTI_VG1x2_F32 : Inst<"svmls_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVMLS_MULTI_VG1x4_F32 : Inst<"svmls_za32[_{d}]_vg1x4", "vm44", "f", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA], []>; + + def SVMLA_SINGLE_VG1x2_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x2", "vm2d", "f", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVMLA_SINGLE_VG1x4_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x4", "vm4d", "f", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVMLS_SINGLE_VG1x2_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x2", "vm2d", "f", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVMLS_SINGLE_VG1x4_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x4", "vm4d", "f", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA], []>; + + def SVMLA_LANE_VG1x2_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x2", "vm2di", "f", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVMLA_LANE_VG1x4_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x4", "vm4di", "f", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVMLS_LANE_VG1x2_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x2", "vm2di", "f", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVMLS_LANE_VG1x4_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x4", "vm4di", "f", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; +} + +let TargetGuard = "sme2,sme-f64f64" in { + def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, 
"aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVMLS_MULTI_VG1x2_F64 : Inst<"svmls_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVMLS_MULTI_VG1x4_F64 : Inst<"svmls_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA], []>; + + def SVMLA_SINGLE_VG1x2_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x2", "vm2d", "d", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVMLA_SINGLE_VG1x4_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x4", "vm4d", "d", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVMLS_SINGLE_VG1x2_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x2", "vm2d", "d", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVMLS_SINGLE_VG1x4_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x4", "vm4d", "d", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA], []>; + + def SVMLA_LANE_VG1x2_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x2", "vm2di", "d", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVMLA_LANE_VG1x4_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVMLS_LANE_VG1x2_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x2", "vm2di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; +} + +// FMLAL/FMLSL/UMLAL/SMLAL +// SMLALL/UMLALL/USMLALL/SUMLALL +let TargetGuard = "sme2" in { + // MULTI MLAL + def SVMLAL_MULTI_VG2x2_F16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlal_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG2x4_F16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, "aarch64_sme_fmlal_vg2x4", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG2x2_S16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "s", MergeNone, "aarch64_sme_smlal_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG2x4_S16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "s", MergeNone, "aarch64_sme_smlal_vg2x4", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG2x2_U16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "Us", MergeNone, "aarch64_sme_umlal_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG2x4_U16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "Us", MergeNone, "aarch64_sme_umlal_vg2x4", [IsStreaming, IsSharedZA], []>; + + def SVMLAL_MULTI_VG4x2_S8 : Inst<"svmla_za32[_{d}]_vg4x2", "vm22", "c", MergeNone, "aarch64_sme_smla_za32_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG4x2_U8 : Inst<"svmla_za32[_{d}]_vg4x2", "vm22", "Uc", MergeNone, "aarch64_sme_umla_za32_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG4x4_S8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smla_za32_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG4x4_U8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umla_za32_vg4x4", [IsStreaming, IsSharedZA], []>; + + // MULTI MLSL + def SVMLSL_MULTI_VG2x2_F16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlsl_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG2x4_F16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, 
"aarch64_sme_fmlsl_vg2x4", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG2x2_S16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "s", MergeNone, "aarch64_sme_smlsl_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG2x4_S16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "s", MergeNone, "aarch64_sme_smlsl_vg2x4", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG2x2_U16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "Us", MergeNone, "aarch64_sme_umlsl_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG2x4_U16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "Us", MergeNone, "aarch64_sme_umlsl_vg2x4", [IsStreaming, IsSharedZA], []>; + + def SVMLSL_MULTI_VG4x2_S8 : Inst<"svmls_za32[_{d}]_vg4x2", "vm22", "c", MergeNone, "aarch64_sme_smls_za32_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG4x2_U8 : Inst<"svmls_za32[_{d}]_vg4x2", "vm22", "Uc", MergeNone, "aarch64_sme_umls_za32_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG4x4_S8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smls_za32_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG4x4_U8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umls_za32_vg4x4", [IsStreaming, IsSharedZA], []>; + + // SINGLE MLAL + def SVMLAL_SINGLE_VG2x1_F16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x1", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG2x2_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG2x4_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x4", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG2x1_S16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "s", MergeNone, "aarch64_sme_smlal_single_vg2x1", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG2x2_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "s", MergeNone, "aarch64_sme_smlal_single_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG2x4_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "s", MergeNone, "aarch64_sme_smlal_single_vg2x4", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG2x1_U16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x1", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG2x2_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG2x4_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x4", [IsStreaming, IsSharedZA], []>; + + def SVMLAL_SINGLE_VG4x1_S8 : Inst<"svmla_za32[_{d}]_vg4x1", "vmdd", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG4x1_U8 : Inst<"svmla_za32[_{d}]_vg4x1", "vmdd", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG4x2_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vm2d", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG4x2_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vm2d", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG4x4_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG4x4_U8 : 
Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>; + + // SINGLE MLSL + def SVMLSL_SINGLE_VG2x1_F16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG2x2_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG2x4_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG2x1_S16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG2x2_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG2x4_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG2x1_U16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG2x2_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG2x4_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>; + + def SVMLSL_SINGLE_VG4x1_S8 : Inst<"svmls_za32[_{d}]_vg4x1", "vmdd", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG4x1_U8 : Inst<"svmls_za32[_{d}]_vg4x1", "vmdd", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG4x2_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vm2d", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG4x2_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vm2d", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG4x4_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG4x4_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>; + + // INDEXED MLAL + def SVMLAL_LANE_VG2x1_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x2_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x4_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x1_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x2_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x4_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", 
"vm4di", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x1_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x2_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG2x4_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + + def SVMLAL_LANE_VG4x1_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmddi", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x1_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmddi", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x2_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vm2di", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x2_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vm2di", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x4_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLAL_LANE_VG4x4_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + + // INDEXED MLSL + def SVMLSL_LANE_VG2x1_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x2_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x4_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x1_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x2_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x4_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x1_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x2_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG2x4_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + + def SVMLSL_LANE_VG4x1_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmddi", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x1", [IsStreaming, IsSharedZA], 
[ImmCheck<3, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x1_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmddi", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x2_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vm2di", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x2_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vm2di", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x4_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVMLSL_LANE_VG4x4_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + + // SINGLE SUMLALL + // Single sumla maps to usmla, with zn & zm operands swapped + def SVSUMLALL_SINGLE_VG4x1 : Inst<"svsumla_za32[_{d}]_vg4x1", "vmdu", "c", MergeNone, "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>; + + def SVSUMLALL_SINGLE_VG4x2 : Inst<"svsumla[_single]_za32[_{d}]_vg4x2", "vm2.du", "c", MergeNone, "aarch64_sme_sumla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVSUMLALL_SINGLE_VG4x4 : Inst<"svsumla[_single]_za32[_{d}]_vg4x4", "vm4.du", "c", MergeNone, "aarch64_sme_sumla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>; + + // Multi-multi sumla builtins are mapped to usmla, with zn & zm operands swapped + def SVSUMLALL_MULTI_VG4x2 : Inst<"svsumla_za32[_{d}]_vg4x2", "vm2.d2.u", "c", MergeNone, "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVSUMLALL_MULTI_VG4x4 : Inst<"svsumla_za32[_{d}]_vg4x4", "vm4.d4.u", "c", MergeNone, "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsSharedZA], []>; + + // INDEXED SUMLALL + def SVSUMLALL_LANE_VG4x1 : Inst<"svsumla_lane_za32[_{d}]_vg4x1", "vmdui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVSUMLALL_LANE_VG4x2 : Inst<"svsumla_lane_za32[_{d}]_vg4x2", "vm2ui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVSUMLALL_LANE_VG4x4 : Inst<"svsumla_lane_za32[_{d}]_vg4x4", "vm4ui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + + // SINGLE USMLALL + def SVUSMLALL_SINGLE_VG4x1 : Inst<"svusmla_za32[_{d}]_vg4x1", "vmdx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>; + def SVUSMLALL_SINGLE_VG4x2 : Inst<"svusmla[_single]_za32[_{d}]_vg4x2", "vm2.dx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVUSMLALL_SINGLE_VG4x4 : Inst<"svusmla[_single]_za32[_{d}]_vg4x4", "vm4.dx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>; + + // MULTI USMLALL + def SVUSMLALL_MULTI_VG4x2 : Inst<"svusmla_za32[_{d}]_vg4x2", "vm2.d2.x", "Uc", MergeNone, "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVUSMLALL_MULTI_VG4x4 : Inst<"svusmla_za32[_{d}]_vg4x4", "vm4.d4.x", "Uc", MergeNone, "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsSharedZA], []>; + + // INDEXED USMLALL + def SVUSMLALL_LANE_VG4x1 : Inst<"svusmla_lane_za32[_{d}]_vg4x1", "vmdxi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], 
[ImmCheck<3, ImmCheck0_15>]>; + def SVUSMLALL_LANE_VG4x2 : Inst<"svusmla_lane_za32[_{d}]_vg4x2", "vm2xi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVUSMLALL_LANE_VG4x4 : Inst<"svusmla_lane_za32[_{d}]_vg4x4", "vm4xi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; +} + +let TargetGuard = "sme2,sme-i16i64" in { + // MULTI MLAL + def SVMLAL_MULTI_VG4x2_S16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smla_za64_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG4x2_U16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG4x4_S16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smla_za64_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG4x4_U16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x4", [IsStreaming, IsSharedZA], []>; + + // MULTI MLSL + def SVMLSL_MULTI_VG4x2_S16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smls_za64_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG4x2_U16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG4x4_S16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smls_za64_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG4x4_U16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x4", [IsStreaming, IsSharedZA], []>; + + // SINGLE MLAL + def SVMLAL_SINGLE_VG4x1_S16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG4x1_U16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG4x2_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG4x2_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG4x4_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVMLAL_SINGLE_VG4x4_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>; + + // SINGLE MLSL + def SVMLSL_SINGLE_VG4x1_S16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG4x1_U16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG4x2_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG4x2_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG4x4_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVMLSL_SINGLE_VG4x4_U16 : 
Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>; + + // INDEXED MLAL + def SVMLAL_LANE_VG4x1_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x1_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x2_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x2_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x4_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLAL_LANE_VG4x4_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + + // INDEXED MLSL + def SVMLSL_LANE_VG4x1_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x1_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x2_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x2_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x4_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x4_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>; +} + // // Spill and fill of ZT0 // diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a29304c81928c..7bc3b7594c8f3 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10318,6 +10318,28 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, return nullptr; } +static void swapCommutativeSMEOperands(unsigned BuiltinID, + SmallVectorImpl<Value *> &Ops) { + unsigned MultiVec; + switch (BuiltinID) { + default: + return; + case SME::BI__builtin_sme_svsumla_za32_s8_vg4x1: + MultiVec = 1; + break; + case SME::BI__builtin_sme_svsumla_za32_s8_vg4x2: + MultiVec = 2; + break; + case SME::BI__builtin_sme_svsumla_za32_s8_vg4x4: + MultiVec = 4; + break; + } + + if (MultiVec > 0) + for (unsigned I = 0; I < MultiVec; ++I) + std::swap(Ops[I + 1], Ops[I + 1 + MultiVec]); +} + Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, const CallExpr *E) { auto *Builtin = findARMVectorIntrinsicInMap(AArch64SMEIntrinsicMap, BuiltinID, @@ -10340,6 +10362,9 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, BuiltinID == SME::BI__builtin_sme_svstr_za) return 
EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic); + // Handle builtins which require their multi-vector operands to be swapped + swapCommutativeSMEOperands(BuiltinID, Ops); + // Should not happen! if (Builtin->LLVMIntrinsic == 0) return nullptr; diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c new file mode 100644 index 0000000000000..f52edd9888daa --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c @@ -0,0 +1,292 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include <arm_sme_draft_spec_subject_to_change.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// CHECK-LABEL: @test_svmla2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_f32j13svfloat32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_za32,_f32,_vg1x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x 
float> [[TMP6]], <vscale x 4 x float> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_f32j13svfloat32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_za32,_f32,_vg1x4,,)(slice_base, zn, zm); +} + +// +// Multi, single +// CHECK-LABEL: @test_svmla_single2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_single_za32,,_f32,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> 
@llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_single_za32,,_f32,,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi, indexed +// CHECK-LABEL: @test_svmla_lane2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_lane_za32,_f32,_vg1x2,,)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svmla_lane4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_lane_za32,_f32,_vg1x4,,)(slice_base, zn, zm, 3); +} + +// +// Multi, multi +// CHECK-LABEL: @test_svmla2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_f64j13svfloat64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svmla2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_za64,_f64,_vg1x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_f64j13svfloat64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_za64,_f64,_vg1x4,,)(slice_base, zn, zm); +} + +// +// Multi, single +// CHECK-LABEL: 
@test_svmla_single2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_single_za64,,_f64,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_single_za64,,_f64,,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi, indexed +// CHECK-LABEL: @test_svmla_lane2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_lane_za64,_f64,_vg1x2,,)(slice_base, zn, zm, 1); +} + +// CHECK-LABEL: @test_svmla_lane4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmla_lane_za64,_f64,_vg1x4,,)(slice_base, zn, zm, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c new file mode 100644 
index 0000000000000..834ade7535076 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c @@ -0,0 +1,696 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include <arm_sme_draft_spec_subject_to_change.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// CHECK-LABEL: @test_svmla2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_f16j13svfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla2_bf16( +// CHECK-NEXT: entry: 
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svmla2_bf16j14svbfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_u16j12svuint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], 
<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla2_s16j11svint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> 
[[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_f16j13svfloat16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z16test_svmla4_bf16j14svbfloat16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_u16j12svuint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmla4_s16j11svint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x4,,)(slice_base, zn, zm); +} + +// +// Multi, single +// CHECK-LABEL: @test_svmla_single1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single1_f16ju13__SVFloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_f16,_vg2x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single1_bf16ju14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_bf16,_vg2x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single1_u16ju12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_u16,_vg2x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single1_s16ju11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_s16,_vg2x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_f16,,_vg2x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single2_bf16j14svbfloat16x2_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_bf16,,_vg2x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_u16,,_vg2x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_s16,,_vg2x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z22test_svmla_single4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_f16,,_vg2x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single4_bf16j14svbfloat16x4_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_bf16,,_vg2x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_u16,,_vg2x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmla_single4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// 
CPP-CHECK-NEXT: ret void +// +void test_svmla_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_s16,,_vg2x4)(slice_base, zn, zm); +} + +// +// Multi, indexed +// + +// CHECK-LABEL: @test_svmla_lane1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_f16ju13__SVFloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_f16,_vg2x1)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmla_lane1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_lane1_bf16ju14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_bf16,_vg2x1)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmla_lane1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_u16ju12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_u16,_vg2x1)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmla_lane1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane1_s16ju11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_s16,_vg2x1)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmla_lane2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> 
@llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_f16,_vg2x2)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmla_lane2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_lane2_bf16j14svbfloat16x2_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_bf16,_vg2x2)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmla_lane2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_u16,_vg2x2)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmla_lane2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_s16,_vg2x2)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmla_lane4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> 
[[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_f16,_vg2x4)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmla_lane4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmla_lane4_bf16j14svbfloat16x4_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_bf16,_vg2x4)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmla_lane4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale 
x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_u16,_vg2x4)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmla_lane4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmla_lane4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,,,_s16,_vg2x4)(slice_base, zn, zm, 7); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c new file mode 100644 index 0000000000000..fceb16a482600 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c @@ -0,0 +1,1790 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature 
+sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include <arm_sme_draft_spec_subject_to_change.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Single x 1 +// + +// MLAL + +// CHECK-LABEL: @test_svmla_single_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x1_s8ju10__SVInt8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single_x1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x1_s16ju11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_uvmlal_single_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_uvmlal_single_x1_u8ju11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlal_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_uvmlal_single_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_uvmlal_single_x1_u16ju12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlal_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x1,,)(slice_base, zn, zm); +} + +// MLSL + +// CHECK-LABEL: @test_svmls_single_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x1_s8ju10__SVInt8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single_x1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x1_s16ju11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_uvmlsl_single_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_uvmlsl_single_x1_u8ju11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlsl_single_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_uvmlsl_single_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_uvmlsl_single_x1_u16ju12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_uvmlsl_single_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming 
__arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x1,,)(slice_base, zn, zm); +} + +// SUMLALL + +// CHECK-LABEL: @test_sumlall_single_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZN:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_sumlall_single_x1_s8ju10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZN:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_sumlall_single_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x1,,)(slice_base, zn, zm); +} + +// USMLALL + +// CHECK-LABEL: @test_usmlall_single_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x1_u8ju11__SVUint8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_single_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x1,,)(slice_base, zn, zm); +} + +// +// Single x 2 +// + +// MLAL + +// CHECK-LABEL: @test_svmla_single_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_s8,,_vg4x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) 
+// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za64,,_s16,,_vg4x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_u8,,_vg4x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za64,,_u16,,_vg4x2)(slice_base, zn, 
zm); +} + +// MLSL + +// CHECK-LABEL: @test_svmls_single_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_s8,,_vg4x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za64,,_s16,,_vg4x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// 
CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_u8,,_vg4x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za64,,_u16,,_vg4x2)(slice_base, zn, zm); +} + +// SUMLALL + +// CHECK-LABEL: @test_svsumla_single_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x2_s8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsumla_single_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svsumla_single_za32,,_s8,,_vg4x2)(slice_base, zn, zm); +} + +// USMLALL + +// CHECK-LABEL: @test_usmlall_single_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail 
call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x2_u8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_single_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svusmla_single_za32,,_u8,,_vg4x2)(slice_base, zn, zm); +} + +// +// Single x 4 +// + +// MLAL + +// CHECK-LABEL: @test_svmla_single_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_s8,,_vg4x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 
x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za64,,_s16,,_vg4x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmla_single_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming 
__arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za32,,_u8,,_vg4x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmla_single_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmla_single_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmla_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_single_za64,,_u16,,_vg4x4)(slice_base, zn, zm); +} + +// MLSL + +// CHECK-LABEL: @test_svmls_single_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: 
[[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_s8,,_vg4x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za64,,_s16,,_vg4x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z23test_svmls_single_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_u8,,_vg4x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z24test_svmls_single_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za64,,_u16,,_vg4x4)(slice_base, zn, zm); +} + +// SUMLALL + +// CHECK-LABEL: @test_svsumla_single_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_svsumla_single_x4_s8j10svint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsumla_single_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svsumla_single_za32,,_s8,,_vg4x4)(slice_base, zn, zm); +} + +// USMLALL + +// CHECK-LABEL: @test_usmlall_single_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z25test_usmlall_single_x4_u8j11svuint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_single_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) 
__arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svusmla_single_za32,,_u8,,_vg4x4)(slice_base, zn, zm); +} + +// +// Multi x 2 +// + +// MLAL + +// CHECK-LABEL: @test_mlal_multi_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_s8j10svint8x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlal_multi_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_s16j11svint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) 
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlal_multi_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x2_u8j11svuint8x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlal_multi_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x2_u16j12svuint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> 
[[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x2,,)(slice_base, zn, zm); +} + +// MLSL + +// CHECK-LABEL: @test_mlsl_multi_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_s8j10svint8x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlsl_multi_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z22test_mlsl_multi_x2_s16j11svint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlsl_multi_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x2_u8j11svuint8x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlsl_multi_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x2_u16j12svuint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x2,,)(slice_base, zn, zm); +} + +// SUMLALL + +// CHECK-LABEL: @test_sumlal_multi_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_sumlal_multi_x2_s8j10svint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_sumlal_multi_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x2,,)(slice_base, zn, zm); +} + +// USMLALL + +// CHECK-LABEL: @test_usmlal_multi_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> 
[[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x2_u8j11svuint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlal_multi_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x2,,)(slice_base, zn, zm); +} + +// +// Multi x 4 +// + +// MLAL + +// CHECK-LABEL: @test_mlal_multi_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_s8j10svint8x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale 
x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_s8,_vg4x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlal_multi_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_s16j11svint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za64,_s16,_vg4x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlal_multi_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlal_multi_x4_u8j11svuint8x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 
x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za32,_u8,_vg4x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlal_multi_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlal_multi_x4_u16j12svuint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlal_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_za64,_u16,_vg4x4,,)(slice_base, zn, zm); +} + +// MLSL + +// CHECK-LABEL: @test_mlsl_multi_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_s8j10svint8x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> 
[[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_s8,_vg4x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlsl_multi_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_s16j11svint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za64,_s16,_vg4x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlsl_multi_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_mlsl_multi_x4_u8j11svuint8x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_u8,_vg4x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_mlsl_multi_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_mlsl_multi_x4_u16j12svuint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_mlsl_multi_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za64,_u16,_vg4x4,,)(slice_base, zn, zm); +} + +// SUMLALL + +// CHECK-LABEL: @test_sumlal_multi_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_sumlal_multi_x4_s8j10svint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_sumlal_multi_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svsumla_za32,_s8,_vg4x4,,)(slice_base, zn, zm); +} + +// USMLALL + +// CHECK-LABEL: @test_usmlal_multi_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlal_multi_x4_u8j11svuint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_usmlal_multi_x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svusmla_za32,_u8,_vg4x4,,)(slice_base, zn, zm); +} + +// +// Indexed x 1 +// + +// SMLAL + +// CHECK-LABEL: @test_smlal_lane_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_s8ju10__SVInt8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlal_lane_x1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_s16ju11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 
[[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x1,,)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_smlal_lane_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x1_u8ju11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlal_lane_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x1_u16ju12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x1,,)(slice_base, zn, zm, 7); +} + +// SMLSL + +// CHECK-LABEL: @test_smlsl_lane_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_s8ju10__SVInt8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_s8(uint32_t slice_base, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlsl_lane_x1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_s16ju11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x1,,)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_smlsl_lane_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 
x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x1_u8ju11__SVUint8_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlsl_lane_x1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x1_u16ju12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x1,,)(slice_base, zn, zm, 7); +} + +// SUMLALL + +// CHECK-LABEL: @test_sumlall_lane_x1_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x1_s8ju10__SVInt8_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_sumlall_lane_x1_s8(uint32_t slice_base, svint8_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x1,,)(slice_base, zn, zm, 15); +} + +// USMLALL + +// CHECK-LABEL: @test_usmlall_lane_x1_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x1_u8ju11__SVUint8_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_lane_x1_u8(uint32_t slice_base, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x1,,)(slice_base, zn, zm, 15); +} + +// +// Indexed x 2 +// + +// SMLAL + +// CHECK-LABEL: @test_smlal_lane_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_s8j10svint8x2_tu10__SVInt8_t( 
+// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlal_lane_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x2,,)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_smlal_lane_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlal_lane_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] 
= tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x2,,)(slice_base, zn, zm, 7); +} + +// SMLSL + +// CHECK-LABEL: @test_smlsl_lane_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlsl_lane_x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x2,,)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_smlsl_lane_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlsl_lane_x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x2,,)(slice_base, zn, zm, 7); +} + +// SUMLALL + +// CHECK-LABEL: @test_sumlall_lane_x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x2_s8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_sumlall_lane_x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x2,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_usmlall_lane_x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x2_u8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_usmlall_lane_x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x2,,)(slice_base, zn, zm, 15); +} + +// +// Indexed x 4 +// + +// MLAL + +// CHECK-LABEL: @test_smlal_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call 
<vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlal_lane_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za64,_s16,_vg4x4,,)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_smlal_lane_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlal_lane_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlal_lane_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlal_lane_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlal_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmla_lane_za64,_u16,_vg4x4,,)(slice_base, zn, zm, 7); +} + +// MLSL + +// 
CHECK-LABEL: @test_smlsl_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlsl_lane_x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) 
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za64,_s16,_vg4x4,,)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_smlsl_lane_x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_smlsl_lane_x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15); +} + +// CHECK-LABEL: @test_smlsl_lane_x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_smlsl_lane_x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x 
i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_smlsl_lane_x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za64,_u16,_vg4x4,,)(slice_base, zn, zm, 7); +} + +// SUMLALL + +// CHECK-LABEL: @test_sumlall_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_sumlall_lane_x4_s8j10svint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15) +// CPP-CHECK-NEXT: ret void +// +void test_sumlall_lane_x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svsumla_lane_za32,_s8,_vg4x4,,)(slice_base, zn, zm, 15); +} + +// USMLALL + +// CHECK-LABEL: @test_usmlall_lane_x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x 
i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: @_Z23test_usmlall_lane_x4_s8j11svuint8x4_tu10__SVInt8_t(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0)
+// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16)
+// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32)
+// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 15)
+// CPP-CHECK-NEXT: ret void
+//
+void test_usmlall_lane_x4_s8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za
+{
+ SVE_ACLE_FUNC(svusmla_lane_za32,_u8,_vg4x4,,)(slice_base, zn, zm, 15);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
new file mode 100644
index 0000000000000..6830a399e91d6
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
@@ -0,0 +1,292 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// CHECK-LABEL: @test_svmls2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_f32j13svfloat32x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZM]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_za32,,_f32,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x 
float> [[TMP6]], <vscale x 4 x float> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_f32j13svfloat32x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZM]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP4]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP6]], <vscale x 4 x float> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_za32,,_f32,,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi, single +// CHECK-LABEL: @test_svmls_single2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_single_za32,,_f32,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> 
@llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_single_za32,,_f32,,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi, indexed +// CHECK-LABEL: @test_svmls_lane2_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f32j13svfloat32x2_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_f32(uint32_t slice_base, svfloat32x2_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_lane_za32,,_f32,,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svmls_lane4_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// 
CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f32j13svfloat32x4_tu13__SVFloat32_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv16f32(<vscale x 16 x float> [[ZN]], i64 12) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 [[SLICE_BASE:%.*]], <vscale x 4 x float> [[TMP0]], <vscale x 4 x float> [[TMP1]], <vscale x 4 x float> [[TMP2]], <vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_f32(uint32_t slice_base, svfloat32x4_t zn, svfloat32_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_lane_za32,,_f32,,_vg1x4)(slice_base, zn, zm, 3); +} + +// +// Multi, multi +// CHECK-LABEL: @test_svmls2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_f64j13svfloat64x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZM]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svmls2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_za64,,_f64,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_f64j13svfloat64x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 2) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 4) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZM]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP4]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP6]], <vscale x 2 x double> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_za64,,_f64,,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi, single +// CHECK-LABEL: 
@test_svmls_single2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_single_za64,,_f64,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_single_za64,,_f64,,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi, indexed +// CHECK-LABEL: @test_svmls_lane2_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f64j13svfloat64x2_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv4f64(<vscale x 4 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_f64(uint32_t slice_base, svfloat64x2_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_lane_za64,,_f64,,_vg1x2)(slice_base, zn, zm, 1); +} + +// CHECK-LABEL: @test_svmls_lane4_f64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f64j13svfloat64x4_tu13__SVFloat64_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 2) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 4) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 2 x double> @llvm.vector.extract.nxv2f64.nxv8f64(<vscale x 8 x double> [[ZN]], i64 6) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 [[SLICE_BASE:%.*]], <vscale x 2 x double> [[TMP0]], <vscale x 2 x double> [[TMP1]], <vscale x 2 x double> [[TMP2]], <vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_f64(uint32_t slice_base, svfloat64x4_t zn, svfloat64_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svmls_lane_za64,,_f64,,_vg1x4)(slice_base, zn, zm, 1); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c new file mode 100644 
index 0000000000000..0a87d97f649c9 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c @@ -0,0 +1,696 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include <arm_sme_draft_spec_subject_to_change.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi +// CHECK-LABEL: @test_svmls2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_f16j13svfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls2_bf16( +// CHECK-NEXT: entry: 
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z16test_svmls2_bf16j14svbfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_u16j12svuint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], 
<vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls2_s16j11svint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x2,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> 
[[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_f16j13svfloat16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z16test_svmls4_bf16j14svbfloat16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_u16j12svuint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: 
[[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x4,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z15test_svmls4_s16j11svint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x4,,)(slice_base, zn, zm); +} + +// +// Multi, single +// CHECK-LABEL: @test_svmls_single1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single1_f16ju13__SVFloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_f16,_vg2x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single1_bf16ju14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_bf16,_vg2x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single1_u16ju12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_u16,_vg2x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single1_s16ju11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_za32,_s16,_vg2x1,,)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_f16,,_vg2x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single2_bf16j14svbfloat16x2_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_bf16,,_vg2x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_u16,,_vg2x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_s16,,_vg2x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z22test_svmls_single4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_f16,,_vg2x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z23test_svmls_single4_bf16j14svbfloat16x4_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_bf16,,_vg2x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_u16,,_vg2x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svmls_single4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z22test_svmls_single4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// 
CPP-CHECK-NEXT: ret void +// +void test_svmls_single4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_single_za32,,_s16,,_vg2x4)(slice_base, zn, zm); +} + +// +// Multi, indexed +// + +// CHECK-LABEL: @test_svmls_lane1_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_f16ju13__SVFloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_f16(uint32_t slice_base, svfloat16_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_f16,,_vg2x1)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmls_lane1_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_lane1_bf16ju14__SVBfloat16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_bf16(uint32_t slice_base, svbfloat16_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_bf16,,_vg2x1)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmls_lane1_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_u16ju12__SVUint16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_u16(uint32_t slice_base, svuint16_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_u16,,_vg2x1)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmls_lane1_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane1_s16ju11__SVInt16_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[ZN:%.*]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane1_s16(uint32_t slice_base, svint16_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_s16,,_vg2x1)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmls_lane2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> 
@llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_f16,,_vg2x2)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmls_lane2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_lane2_bf16j14svbfloat16x2_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_bf16,,_vg2x2)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmls_lane2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void 
@llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_u16,,_vg2x2)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmls_lane2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_s16,,_vg2x2)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmls_lane4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> 
[[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_f16,,_vg2x4)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmls_lane4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z21test_svmls_lane4_bf16j14svbfloat16x4_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_bf16,,_vg2x4)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmls_lane4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale 
x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_u16,,_vg2x4)(slice_base, zn, zm, 7); +} + +// CHECK-LABEL: @test_svmls_lane4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svmls_lane4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 7) +// CPP-CHECK-NEXT: ret void +// +void test_svmls_lane4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za +{ + SVE_ACLE_FUNC(svmls_lane_za32,,_s16,,_vg2x4)(slice_base, zn, zm, 7); +} diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp index 6ab6dabd92b19..d07d83a53e462 100644 --- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp @@ -241,3 +241,47 @@ void test_bfmlslb_bad_lane(svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) __ svbfmlslb_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} svbfmlslt_lane_f32(zda, zn, zm, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} } + +void test_multiply_add_sub_long(uint32_t base, svint8_t s8, svuint8_t u8, 
+ svint16_t s16, svuint16_t u16, svint8x2_t s8x2, + svuint8x2_t u8x2, svint16x2_t s16x2, svuint16x2_t u16x2, + svint8x4_t s8x4, svuint8x4_t u8x4, svint16x4_t s16x4, svuint16x4_t u16x4) __arm_streaming __arm_shared_za { + + svmla_lane_za32_s8_vg4x1(base, s8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za32_u8_vg4x1(base, u8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za64_s16_vg4x1(base, s16, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmla_lane_za64_u16_vg4x1(base, u16, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmla_lane_za32_s8_vg4x2(base, s8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za32_u8_vg4x2(base, u8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za64_s16_vg4x2(base, s16x2, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmla_lane_za64_u16_vg4x2(base, u16x2, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmla_lane_za32_s8_vg4x4(base, s8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za32_u8_vg4x4(base, u8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmla_lane_za64_s16_vg4x4(base, s16x4, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmla_lane_za64_u16_vg4x4(base, u16x4, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmls_lane_za32_s8_vg4x1(base, s8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za32_u8_vg4x1(base, u8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za64_s16_vg4x1(base, s16, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmls_lane_za64_u16_vg4x1(base, u16, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmls_lane_za32_s8_vg4x2(base, s8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za32_u8_vg4x2(base, u8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za64_s16_vg4x2(base, s16x2, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmls_lane_za64_u16_vg4x2(base, u16x2, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svmls_lane_za32_s8_vg4x4(base, s8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za32_u8_vg4x4(base, u8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svmls_lane_za64_s16_vg4x4(base, s16x4, s16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + svmls_lane_za64_u16_vg4x4(base, u16x4, u16, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + + svsumla_lane_za32_s8_vg4x1(base, s8, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svsumla_lane_za32_s8_vg4x2(base, s8x2, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svsumla_lane_za32_s8_vg4x4(base, s8x4, u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + + 
svusmla_lane_za32_u8_vg4x1(base, u8, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svusmla_lane_za32_u8_vg4x2(base, u8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + svusmla_lane_za32_u8_vg4x4(base, u8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} +} From e6751c1a128320420801370ab662f213df5791b5 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com> Date: Thu, 21 Dec 2023 16:46:38 +0000 Subject: [PATCH 094/342] [gn build] Port 0ea87560cca4 --- llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index 3debc48a4bb99..6059074dfa27b 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -120,6 +120,7 @@ copy("Headers") { "__stddef_wint_t.h", "__wmmintrin_aes.h", "__wmmintrin_pclmul.h", + "adcintrin.h", "adxintrin.h", "altivec.h", "ammintrin.h", From 72003adf6bd44e91778c22e42e94a28c28be2339 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski <jakub@nod-labs.com> Date: Thu, 21 Dec 2023 11:55:43 -0500 Subject: [PATCH 095/342] [mlir][gpu] Allow subgroup reductions over 1-d vector types (#76015) Each vector element is reduced independently, which is a form of multi-reduction. The plan is to allow for gradual lowering of multi-reduction that results in fewer `gpu.shuffle` ops at the end: 1d `vector.multi_reduction` --> 1d `gpu.subgroup_reduce` --> smaller 1d `gpu.subgroup_reduce` --> packed `gpu.shuffle` over i32 For example we can perform 2 independent f16 reductions with a series of `gpu.shuffles` over i32, reducing the final number of `gpu.shuffles` by 2x. --- mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 16 ++++++-- mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp | 12 ++++-- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 11 +++++- .../Conversion/GPUToSPIRV/reductions.mlir | 38 +++++++++++++++++++ mlir/test/Dialect/GPU/invalid.mlir | 14 +++++-- mlir/test/Dialect/GPU/ops.mlir | 5 +++ 6 files changed, 84 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td index c72fde2ab351d..b536b6c97cef0 100644 --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -19,10 +19,11 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td" include "mlir/Dialect/GPU/IR/CompilationAttrs.td" include "mlir/Dialect/GPU/IR/ParallelLoopMapperAttr.td" include "mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td" +include "mlir/IR/CommonTypeConstraints.td" include "mlir/IR/EnumAttr.td" -include "mlir/Interfaces/FunctionInterfaces.td" include "mlir/IR/SymbolInterfaces.td" include "mlir/Interfaces/DataLayoutInterfaces.td" +include "mlir/Interfaces/FunctionInterfaces.td" include "mlir/Interfaces/InferIntRangeInterface.td" include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" @@ -1023,16 +1024,23 @@ def GPU_AllReduceOp : GPU_Op<"all_reduce", let hasRegionVerifier = 1; } +def AnyIntegerOrFloatOr1DVector : + AnyTypeOf<[AnyIntegerOrFloat, VectorOfRankAndType<[1], [AnyIntegerOrFloat]>]>; + def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType]> { let summary = "Reduce values among subgroup."; let description = [{ The `subgroup_reduce` op reduces the value of every work item across a subgroup. 
The result is equal for all work items of a subgroup. + When the reduced value is of a vector type, each vector element is reduced + independently. Only 1-d vector types are allowed. + Example: ```mlir - %1 = gpu.subgroup_reduce add %0 : (f32) -> (f32) + %1 = gpu.subgroup_reduce add %a : (f32) -> (f32) + %2 = gpu.subgroup_reduce add %b : (vector<4xf16>) -> (vector<4xf16>) ``` If `uniform` flag is set either none or all work items of a subgroup @@ -1045,11 +1053,11 @@ def GPU_SubgroupReduceOp : GPU_Op<"subgroup_reduce", [SameOperandsAndResultType] }]; let arguments = (ins - AnyIntegerOrFloat:$value, + AnyIntegerOrFloatOr1DVector:$value, GPU_AllReduceOperationAttr:$op, UnitAttr:$uniform ); - let results = (outs AnyIntegerOrFloat:$result); + let results = (outs AnyIntegerOrFloatOr1DVector:$result); let assemblyFormat = [{ custom<AllReduceOperation>($op) $value (`uniform` $uniform^)? attr-dict diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp index d383c16949f0e..d7885e0359592 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp @@ -16,10 +16,12 @@ #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" #include "mlir/Dialect/SPIRV/IR/SPIRVEnums.h" #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h" #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" #include "mlir/Dialect/SPIRV/Transforms/SPIRVConversion.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Matchers.h" +#include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/DialectConversion.h" #include <optional> @@ -591,10 +593,12 @@ class GPUSubgroupReduceConversion final LogicalResult matchAndRewrite(gpu::SubgroupReduceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto opType = op.getOp(); - auto result = - createGroupReduceOp(rewriter, op.getLoc(), adaptor.getValue(), opType, - /*isGroup*/ false, op.getUniform()); + if (!isa<spirv::ScalarType>(adaptor.getValue().getType())) + return rewriter.notifyMatchFailure(op, "reduction type is not a scalar"); + + auto result = createGroupReduceOp(rewriter, op.getLoc(), adaptor.getValue(), + adaptor.getOp(), + /*isGroup=*/false, adaptor.getUniform()); if (!result) return failure(); diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 7c3330f4c238f..dd482f305fcbc 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -19,6 +19,7 @@ #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Diagnostics.h" #include "mlir/IR/DialectImplementation.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/OpImplementation.h" @@ -588,8 +589,16 @@ static void printAllReduceOperation(AsmPrinter &printer, Operation *op, //===----------------------------------------------------------------------===// LogicalResult gpu::SubgroupReduceOp::verify() { + Type elemType = getType(); + if (auto vecTy = dyn_cast<VectorType>(elemType)) { + if (vecTy.isScalable()) + return emitOpError() << "is not compatible with scalable vector types"; + + elemType = vecTy.getElementType(); + } + gpu::AllReduceOperation opName = getOp(); - if (failed(verifyReduceOpAndType(opName, getType()))) { + if (failed(verifyReduceOpAndType(opName, elemType))) { return emitError() << '`' << gpu::stringifyAllReduceOperation(opName) << "` reduction operation is not compatible with type " << getType(); diff --git 
a/mlir/test/Conversion/GPUToSPIRV/reductions.mlir b/mlir/test/Conversion/GPUToSPIRV/reductions.mlir index af58f4173136f..44f85f68587f1 100644 --- a/mlir/test/Conversion/GPUToSPIRV/reductions.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/reductions.mlir @@ -655,6 +655,26 @@ gpu.module @kernels { // ----- +module attributes { + gpu.container_module, + spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Kernel, Addresses, Groups, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], []>, #spirv.resource_limits<>> +} { + +gpu.module @kernels { + // CHECK-LABEL: spirv.func @test + // CHECK-SAME: (%[[ARG:.*]]: i32) + gpu.func @test(%arg : vector<1xi32>) kernel + attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 1, 1]>} { + // CHECK: %{{.*}} = spirv.GroupNonUniformSMax "Subgroup" "Reduce" %[[ARG]] : i32 + %r0 = gpu.subgroup_reduce maxsi %arg : (vector<1xi32>) -> (vector<1xi32>) + gpu.return + } +} + +} + +// ----- + // TODO: Handle boolean reductions. module attributes { @@ -751,3 +771,21 @@ gpu.module @kernels { } } } + +// ----- + +// Vector reductions need to be lowered to scalar reductions first. + +module attributes { + gpu.container_module, + spirv.target_env = #spirv.target_env<#spirv.vce<v1.3, [Kernel, Addresses, Groups, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], []>, #spirv.resource_limits<>> +} { +gpu.module @kernels { + gpu.func @maxui(%arg : vector<2xi32>) kernel + attributes {spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [16, 1, 1]>} { + // expected-error @+1 {{failed to legalize operation 'gpu.subgroup_reduce'}} + %r0 = gpu.subgroup_reduce maxui %arg : (vector<2xi32>) -> (vector<2xi32>) + gpu.return + } +} +} diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir index d8a40f89f80ac..8a34d64326072 100644 --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -333,9 +333,17 @@ func.func @reduce_invalid_op_type_maximumf(%arg0 : i32) { // ----- -func.func @subgroup_reduce_bad_type(%arg0 : vector<2xf32>) { - // expected-error@+1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float}} - %res = gpu.subgroup_reduce add %arg0 : (vector<2xf32>) -> vector<2xf32> +func.func @subgroup_reduce_bad_type(%arg0 : vector<2x2xf32>) { + // expected-error@+1 {{'gpu.subgroup_reduce' op operand #0 must be Integer or Float or vector of}} + %res = gpu.subgroup_reduce add %arg0 : (vector<2x2xf32>) -> vector<2x2xf32> + return +} + +// ----- + +func.func @subgroup_reduce_bad_type_scalable(%arg0 : vector<[2]xf32>) { + // expected-error@+1 {{is not compatible with scalable vector types}} + %res = gpu.subgroup_reduce add %arg0 : (vector<[2]xf32>) -> vector<[2]xf32> return } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir index 4819343641563..6051242438305 100644 --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -84,6 +84,8 @@ module attributes {gpu.container_module} { %one = arith.constant 1.0 : f32 + %vec = vector.broadcast %arg0 : f32 to vector<4xf32> + // CHECK: %{{.*}} = gpu.all_reduce add %{{.*}} { // CHECK-NEXT: } : (f32) -> f32 %sum = gpu.all_reduce add %one {} : (f32) -> (f32) @@ -98,6 +100,9 @@ module attributes {gpu.container_module} { // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} uniform : (f32) -> f32 %sum_subgroup1 = gpu.subgroup_reduce add %one uniform : (f32) -> f32 + // CHECK: %{{.*}} = gpu.subgroup_reduce add %{{.*}} : (vector<4xf32>) -> vector<4xf32> + %sum_subgroup2 = gpu.subgroup_reduce add %vec : 
(vector<4xf32>) -> vector<4xf32> + %width = arith.constant 7 : i32 %offset = arith.constant 3 : i32 // CHECK: gpu.shuffle xor %{{.*}}, %{{.*}}, %{{.*}} : f32 From 11140cc238b8c4124e6f9efacb1601f81da096a0 Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" <zinenko@google.com> Date: Thu, 21 Dec 2023 17:58:53 +0100 Subject: [PATCH 096/342] [mlir] mark ChangeResult as nodiscard (#76147) This enum is used by dataflow analyses to indicate whether further propagation is necessary to reach the fix point. Accidentally discarding such a value will likely lead to propagation stopping early, leading to incomplete or incorrect results. The most egregious example is the duality between `join` on the analysis class, which triggers propagation internally, and `join` on the lattice class that does not and expects the caller to trigger it depending on the returned `ChangeResult`. --- mlir/include/mlir/Analysis/DataFlowFramework.h | 4 ++-- mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp | 2 +- mlir/test/lib/Analysis/TestDataFlowFramework.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h index 541cdb1e237c1..c76cfac07fc77 100644 --- a/mlir/include/mlir/Analysis/DataFlowFramework.h +++ b/mlir/include/mlir/Analysis/DataFlowFramework.h @@ -30,8 +30,8 @@ namespace mlir { //===----------------------------------------------------------------------===// /// A result type used to indicate if a change happened. Boolean operations on -/// ChangeResult behave as though `Change` is truthy. -enum class ChangeResult { +/// ChangeResult behave as though `Change` is truth. +enum class [[nodiscard]] ChangeResult { NoChange, Change, }; diff --git a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp index 2820d27b65f7a..7875fa9d43d9e 100644 --- a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp @@ -191,7 +191,7 @@ void LivenessAnalysis::visitCallOperand(OpOperand &operand) { void LivenessAnalysis::setToExitState(Liveness *lattice) { // This marks values of type (2) liveness as "live". - lattice->markLive(); + (void)lattice->markLive(); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/Analysis/TestDataFlowFramework.cpp b/mlir/test/lib/Analysis/TestDataFlowFramework.cpp index ed361b5a0e270..b6b33182440cf 100644 --- a/mlir/test/lib/Analysis/TestDataFlowFramework.cpp +++ b/mlir/test/lib/Analysis/TestDataFlowFramework.cpp @@ -100,7 +100,7 @@ LogicalResult FooAnalysis::initialize(Operation *top) { return top->emitError("expected at least one block in the region"); // Initialize the top-level state. - getOrCreate<FooState>(&top->getRegion(0).front())->join(0); + (void)getOrCreate<FooState>(&top->getRegion(0).front())->join(0); // Visit all nested blocks and operations. 
for (Block &block : top->getRegion(0)) { From 537b2aa264c5a9879a80289c8d123b39e520eb15 Mon Sep 17 00:00:00 2001 From: Maksim Levental <maksim.levental@gmail.com> Date: Thu, 21 Dec 2023 11:20:29 -0600 Subject: [PATCH 097/342] [mlir][python] meta region_op (#75673) --- mlir/python/CMakeLists.txt | 9 +- mlir/python/mlir/dialects/arith.py | 8 + mlir/python/mlir/dialects/builtin.py | 23 +++ mlir/python/mlir/dialects/func.py | 3 + mlir/python/mlir/dialects/pdl.py | 10 +- mlir/python/mlir/dialects/scf.py | 2 +- mlir/python/mlir/dialects/tensor.py | 7 + .../mlir/dialects/transform/__init__.py | 13 +- .../dialects/transform/extras/__init__.py | 15 +- mlir/python/mlir/extras/meta.py | 83 ++++++++++ mlir/test/python/dialects/arith_dialect.py | 6 +- mlir/test/python/dialects/tensor.py | 35 ++++ mlir/test/python/dialects/transform_extras.py | 73 ++++++++- .../python/integration/dialects/transform.py | 155 ++++++++++++++++++ 14 files changed, 429 insertions(+), 13 deletions(-) create mode 100644 mlir/python/mlir/extras/meta.py create mode 100644 mlir/test/python/integration/dialects/transform.py diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index 55c5973e40e52..3c9cf304d88a2 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -21,7 +21,6 @@ declare_mlir_python_sources(MLIRPythonSources.Core.Python _mlir_libs/__init__.py ir.py passmanager.py - extras/types.py dialects/_ods_common.py # The main _mlir module has submodules: include stubs from each. @@ -30,6 +29,14 @@ declare_mlir_python_sources(MLIRPythonSources.Core.Python _mlir_libs/_mlir/passmanager.pyi ) +declare_mlir_python_sources(MLIRPythonSources.Core.Python.Extras + ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" + ADD_TO_PARENT MLIRPythonSources.Core.Python + SOURCES + extras/types.py + extras/meta.py +) + declare_mlir_python_sources(MLIRPythonSources.ExecutionEngine ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir" ADD_TO_PARENT MLIRPythonSources diff --git a/mlir/python/mlir/dialects/arith.py b/mlir/python/mlir/dialects/arith.py index 83aca0d58bf2c..663a53660a647 100644 --- a/mlir/python/mlir/dialects/arith.py +++ b/mlir/python/mlir/dialects/arith.py @@ -11,6 +11,8 @@ from ._ods_common import ( get_default_loc_context as _get_default_loc_context, _cext as _ods_cext, + get_op_result_or_op_results as _get_op_result_or_op_results, + SubClassValueT as _SubClassValueT, ) from typing import Any, List, Union @@ -75,3 +77,9 @@ def literal_value(self) -> Union[int, float]: return FloatAttr(self.value).value else: raise ValueError("only integer and float constants have literal values") + + +def constant( + result: Type, value: Union[int, float, Attribute], *, loc=None, ip=None +) -> _SubClassValueT: + return _get_op_result_or_op_results(ConstantOp(result, value, loc=loc, ip=ip)) diff --git a/mlir/python/mlir/dialects/builtin.py b/mlir/python/mlir/dialects/builtin.py index b71cc2466d464..1c69d6d7c3a0b 100644 --- a/mlir/python/mlir/dialects/builtin.py +++ b/mlir/python/mlir/dialects/builtin.py @@ -2,8 +2,11 @@ # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +from typing import Dict, Optional + from ._builtin_ops_gen import * from ._builtin_ops_gen import _Dialect +from ..extras.meta import region_op try: from ..ir import * @@ -23,3 +26,23 @@ def __init__(self, *, loc=None, ip=None): @property def body(self): return self.regions[0].blocks[0] + + +@region_op +def module( + *, + sym_name=None, + sym_visibility=None, + attrs: Optional[Dict[str, Attribute]] = None, + loc=None, + ip=None, +): + mod = ModuleOp.__base__( + sym_name=sym_name, sym_visibility=sym_visibility, loc=loc, ip=ip + ) + if attrs is None: + attrs = {} + for attr_name, attr in attrs.items(): + mod.operation.attributes[attr_name] = attr + + return mod diff --git a/mlir/python/mlir/dialects/func.py b/mlir/python/mlir/dialects/func.py index 6599f67b70787..24fdcbcd85b29 100644 --- a/mlir/python/mlir/dialects/func.py +++ b/mlir/python/mlir/dialects/func.py @@ -243,6 +243,9 @@ def emit_call_op(*call_args): return decorator +func = FuncOp.from_py_func + + @_ods_cext.register_operation(_Dialect, replace=True) class CallOp(CallOp): """Specialization for the call op class.""" diff --git a/mlir/python/mlir/dialects/pdl.py b/mlir/python/mlir/dialects/pdl.py index 90d7d706238e6..db07dc50aabd7 100644 --- a/mlir/python/mlir/dialects/pdl.py +++ b/mlir/python/mlir/dialects/pdl.py @@ -5,6 +5,7 @@ from ._pdl_ops_gen import * from ._pdl_ops_gen import _Dialect from .._mlir_libs._mlirDialectsPDL import * +from .._mlir_libs._mlirDialectsPDL import OperationType try: @@ -13,7 +14,7 @@ except ImportError as e: raise RuntimeError("Error loading imports from extension module") from e -from typing import Union, Optional, Sequence, Mapping +from typing import Union, Optional, Sequence, Mapping, NewType from ._ods_common import ( get_op_result_or_value as _get_value, get_op_results_or_values as _get_values, @@ -220,3 +221,10 @@ def __init__( constantTypes = [] result = pdl.RangeType.get(pdl.TypeType.get()) super().__init__(result, constantTypes=constantTypes, loc=loc, ip=ip) + + +OperationTypeT = NewType("OperationType", OperationType) + + +def op_t() -> OperationTypeT: + return OperationTypeT(OperationType.get()) diff --git a/mlir/python/mlir/dialects/scf.py b/mlir/python/mlir/dialects/scf.py index 20bbed9bc93df..dad7377987e56 100644 --- a/mlir/python/mlir/dialects/scf.py +++ b/mlir/python/mlir/dialects/scf.py @@ -120,7 +120,7 @@ def for_( params = [start, stop, step] for i, p in enumerate(params): if isinstance(p, int): - p = constant(IntegerAttr.get(IndexType.get(), p)) + p = constant(IndexType.get(), p) elif isinstance(p, float): raise ValueError(f"{p=} must be int.") params[i] = p diff --git a/mlir/python/mlir/dialects/tensor.py b/mlir/python/mlir/dialects/tensor.py index 67248748eaf3a..79dd9476ad0ff 100644 --- a/mlir/python/mlir/dialects/tensor.py +++ b/mlir/python/mlir/dialects/tensor.py @@ -4,6 +4,7 @@ from ._tensor_ops_gen import * from ._tensor_ops_gen import _Dialect +from ..extras.meta import region_op try: from ..ir import * @@ -40,3 +41,9 @@ def __init__( dynamic_sizes.append(s) result_type = RankedTensorType.get(static_sizes, element_type) super().__init__(result_type, dynamic_sizes, loc=loc, ip=ip) + + +generate = region_op( + lambda result, dynamic_extents: GenerateOp(result, dynamic_extents), + terminator=lambda args: YieldOp(args[0]), +) diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py index 175634c7d458f..5b158ec6b65fd 100644 --- 
a/mlir/python/mlir/dialects/transform/__init__.py +++ b/mlir/python/mlir/dialects/transform/__init__.py @@ -18,7 +18,7 @@ except ImportError as e: raise RuntimeError("Error loading imports from extension module") from e -from typing import Optional, Sequence, Union +from typing import Optional, Sequence, Union, NewType @_ods_cext.register_operation(_Dialect, replace=True) @@ -175,7 +175,7 @@ def __init__( result_types: Sequence[Type], sym_visibility=None, arg_attrs=None, - res_attrs=None + res_attrs=None, ): function_type = FunctionType.get(input_types, result_types) super().__init__( @@ -183,7 +183,7 @@ def __init__( function_type=TypeAttr.get(function_type), sym_visibility=sym_visibility, arg_attrs=arg_attrs, - res_attrs=res_attrs + res_attrs=res_attrs, ) self.regions[0].blocks.append(*input_types) @@ -212,3 +212,10 @@ def __init__( if operands is None: operands = [] super().__init__(_get_op_results_or_values(operands), loc=loc, ip=ip) + + +AnyOpTypeT = NewType("AnyOpType", AnyOpType) + + +def any_op_t() -> AnyOpTypeT: + return AnyOpTypeT(AnyOpType.get()) diff --git a/mlir/python/mlir/dialects/transform/extras/__init__.py b/mlir/python/mlir/dialects/transform/extras/__init__.py index c715dac1ef7eb..e4d47e9064f2c 100644 --- a/mlir/python/mlir/dialects/transform/extras/__init__.py +++ b/mlir/python/mlir/dialects/transform/extras/__init__.py @@ -4,8 +4,16 @@ from typing import Callable, Optional, Sequence, Union +from ....extras.meta import region_op from .... import ir -from .. import AnyOpType, OperationType, NamedSequenceOp, YieldOp +from .. import ( + AnyOpType, + OperationType, + NamedSequenceOp, + YieldOp, + SequenceOp, + ApplyPatternsOp, +) from .. import structured @@ -147,3 +155,8 @@ def test_match_ops_single(module: OpHandle): if dump_script: print(named_sequence_op) + + +sequence = region_op(SequenceOp.__base__, terminator=YieldOp) +named_sequence = region_op(NamedSequenceOp, terminator=YieldOp) +apply_patterns = region_op(ApplyPatternsOp) diff --git a/mlir/python/mlir/extras/meta.py b/mlir/python/mlir/extras/meta.py new file mode 100644 index 0000000000000..3f2defadf7941 --- /dev/null +++ b/mlir/python/mlir/extras/meta.py @@ -0,0 +1,83 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import inspect +from functools import wraps + +from ..dialects._ods_common import get_op_result_or_op_results +from ..ir import Type, InsertionPoint + + +def op_region_builder(op, op_region, terminator=None): + def builder_wrapper(body_builder): + # Add a block with block args having types determined by type hints on the wrapped function. 
+ if len(op_region.blocks) == 0: + sig = inspect.signature(body_builder) + types = [p.annotation for p in sig.parameters.values()] + if not ( + len(types) == len(sig.parameters) + and all(isinstance(t, Type) for t in types) + ): + raise ValueError( + f"for {body_builder=} either missing a type annotation or type annotation isn't a mlir type: {sig}" + ) + + op_region.blocks.append(*types) + + with InsertionPoint(op_region.blocks[0]): + results = body_builder(*list(op_region.blocks[0].arguments)) + + with InsertionPoint(list(op_region.blocks)[-1]): + if terminator is not None: + res = [] + if isinstance(results, (tuple, list)): + res.extend(results) + elif results is not None: + res.append(results) + terminator(res) + + return get_op_result_or_op_results(op) + + return builder_wrapper + + +def region_op(op_constructor, terminator=None): + """Decorator to define an MLIR Op specified as a python function. + + Requires that an `mlir.ir.InsertionPoint` and `mlir.ir.Location` are + active for the current thread (i.e. established in a `with` block). + + Supports "naked" usage i.e., no parens if no args need to be passed to the Op constructor. + + When applied as a decorator to a Python function, an entry block will + be constructed for the Op with types as specified **as type hints on the args of the function**. + The block arguments will be passed positionally to the Python function. + + If a terminator is specified then the return from the decorated function will be passed + to the terminator as the last statement in the entry block. Note, the API for the terminator + is a (possibly empty) list; terminator accepting single values should be wrapped in a + `lambda args: term(args[0])` + + The identifier (name) of the function will become: + 1. A single value result if the Op returns a single value; + 2. An OpResultList (as a list) if the Op returns multiple values; + 3. The Operation if the Op returns no results. + + See examples in tensor.py and transform.extras. 
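+
+    A rough usage sketch (for illustration only; it mirrors the
+    `tensor.generate` wrapper that tensor.py defines with this decorator):
+
+      generate = region_op(
+          lambda result, dynamic_extents: GenerateOp(result, dynamic_extents),
+          terminator=lambda args: YieldOp(args[0]),
+      )
+
+      @generate(T.tensor(S, 3, S, T.index()), dynamic_extents=[one, two])
+      def gen(i: T.index(), j: T.index(), k: T.index()):
+          return arith.addi(arith.addi(i, j), k)
+
+    Here `gen` is bound to the single `tensor.generate` result, and the value
+    returned from the body is handed to the `YieldOp` terminator.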
+ """ + + def op_decorator(*args, **kwargs): + op = op_constructor(*args, **kwargs) + op_region = op.regions[0] + + return op_region_builder(op, op_region, terminator) + + @wraps(op_decorator) + def maybe_no_args(*args, **kwargs): + if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): + return op_decorator()(args[0]) + else: + return op_decorator(*args, **kwargs) + + return maybe_no_args diff --git a/mlir/test/python/dialects/arith_dialect.py b/mlir/test/python/dialects/arith_dialect.py index f80f2c084a0f3..8bb80eed2b810 100644 --- a/mlir/test/python/dialects/arith_dialect.py +++ b/mlir/test/python/dialects/arith_dialect.py @@ -75,7 +75,7 @@ def __str__(self): f64_t = F64Type.get() with InsertionPoint(module.body): - a = arith.constant(value=FloatAttr.get(f16_t, 42.42)) + a = arith.constant(f16_t, 42.42) # CHECK: ArithValue(%cst = arith.constant 4.240 print(a) @@ -83,12 +83,12 @@ def __str__(self): # CHECK: ArithValue(%0 = arith.addf %cst, %cst : f16) print(b) - a = arith.constant(value=FloatAttr.get(f32_t, 42.42)) + a = arith.constant(f32_t, 42.42) b = a - a # CHECK: ArithValue(%1 = arith.subf %cst_0, %cst_0 : f32) print(b) - a = arith.constant(value=FloatAttr.get(f64_t, 42.42)) + a = arith.constant(f64_t, 42.42) b = a * a # CHECK: ArithValue(%2 = arith.mulf %cst_1, %cst_1 : f64) print(b) diff --git a/mlir/test/python/dialects/tensor.py b/mlir/test/python/dialects/tensor.py index b690c934dc46b..ca9066b239111 100644 --- a/mlir/test/python/dialects/tensor.py +++ b/mlir/test/python/dialects/tensor.py @@ -4,6 +4,7 @@ import mlir.dialects.arith as arith import mlir.dialects.func as func import mlir.dialects.tensor as tensor +from mlir.extras import types as T def run(f): @@ -139,3 +140,37 @@ def default_builder(): t = tensor.FromElementsOp(RankedTensorType.get((1, 2), f32), [c0, c1]) # CHECK: %{{.*}} = "tensor.from_elements"(%[[C0]], %[[C1]]) : (f32, f32) -> tensor<1x2xf32> print(t) + + +# CHECK-LABEL: TEST: testGenerateRegionOp +@run +def testGenerateRegionOp(): + S = ShapedType.get_dynamic_size() + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + # CHECK: %[[VAL_0:.*]] = arith.constant 1 : index + # CHECK: %[[VAL_1:.*]] = arith.constant 2 : index + one = arith.constant(T.index(), 1) + two = arith.constant(T.index(), 2) + + @tensor.generate(T.tensor(S, 3, S, T.index()), dynamic_extents=[one, two]) + def generate_one(i: T.index(), j: T.index(), k: T.index()): + ij = arith.addi(i, j) + ijk = arith.addi(ij, k) + return ijk + + assert ( + isinstance(generate_one, Value) + and generate_one.owner.name == "tensor.generate" + ) + + # CHECK: %[[GENERATED:.*]] = tensor.generate + # CHECK-SAME: %[[VAL_0]], + # CHECK-SAME: %[[VAL_1]] { + # CHECK: ^bb0(%[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index, %[[VAL_3:.*]]: index): + # CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_1]], %[[VAL_2]] : index + # CHECK: %[[VAL_5:.*]] = arith.addi %[[VAL_4]], %[[VAL_3]] : index + # CHECK: tensor.yield %[[VAL_5]] : index + # CHECK: } : tensor<?x3x?xindex> + print(module) diff --git a/mlir/test/python/dialects/transform_extras.py b/mlir/test/python/dialects/transform_extras.py index e7b43ea63c31c..358f8c32f75c7 100644 --- a/mlir/test/python/dialects/transform_extras.py +++ b/mlir/test/python/dialects/transform_extras.py @@ -2,9 +2,34 @@ from typing import Callable from mlir import ir -from mlir.dialects import scf -from mlir.dialects.transform import structured -from mlir.dialects.transform.extras import OpHandle, insert_transform_script +from mlir.dialects import scf, 
pdl +from mlir.dialects.transform import ( + structured, + get_parent_op, + apply_patterns_canonicalization, + apply_cse, + any_op_t, +) +from mlir.dialects.transform import FailurePropagationMode +from mlir.dialects.transform.structured import structured_match +from mlir.dialects.transform.loop import loop_unroll +from mlir.dialects.transform.extras import ( + OpHandle, + insert_transform_script, + sequence, + apply_patterns, +) +from mlir.extras import types as T + + +def construct_and_print_in_module(f): + print("\nTEST:", f.__name__) + with ir.Context(), ir.Location.unknown(): + module = ir.Module.create() + with ir.InsertionPoint(module.body): + f() + print(module) + return f def build_transform_script(script: Callable[[OpHandle], None]): @@ -93,3 +118,45 @@ def test_match_ops_mixed(op: OpHandle): # CHECK-NEXT: %[[VAL_1:.*]] = transform.structured.match # CHECK-SAME: ops{["scf.for", "linalg.matmul", "scf.forall"]} in %[[VAL_0]] # CHECK-SAME: -> !transform.any_op + + +# CHECK-LABEL: TEST: test_sequence_region +@construct_and_print_in_module +def test_sequence_region(): + # CHECK: transform.sequence failures(propagate) { + # CHECK: ^{{.*}}(%[[VAL_0:.*]]: !transform.any_op): + # CHECK: %[[VAL_1:.*]] = transform.structured.match ops{["arith.addi"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op + # CHECK: %[[VAL_2:.*]] = get_parent_op %[[VAL_1]] {op_name = "scf.for"} : (!transform.any_op) -> !pdl.operation + # CHECK: transform.loop.unroll %[[VAL_2]] {factor = 4 : i64} : !pdl.operation + # CHECK: } + @sequence([], FailurePropagationMode.Propagate, []) + def basic(target: any_op_t()): + m = structured_match(any_op_t(), target, ops=["arith.addi"]) + loop = get_parent_op(pdl.op_t(), m, op_name="scf.for") + loop_unroll(loop, 4) + + +# CHECK-LABEL: TEST: test_apply_patterns +@construct_and_print_in_module +def test_apply_patterns(): + # CHECK: transform.sequence failures(propagate) { + # CHECK: ^{{.*}}(%[[VAL_0:.*]]: !transform.any_op): + # CHECK: %[[VAL_1:.*]] = transform.structured.match ops{["linalg.matmul"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op + # CHECK: %[[VAL_2:.*]] = get_parent_op %[[VAL_1]] {op_name = "func.func"} : (!transform.any_op) -> !pdl.operation + # CHECK: apply_patterns to %[[VAL_2]] { + # CHECK: transform.apply_patterns.canonicalization + # CHECK: } : !pdl.operation + # CHECK: %[[VAL_3:.*]] = transform.structured.match ops{["func.func"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op + # CHECK: apply_cse to %[[VAL_3]] : !transform.any_op + # CHECK: } + @sequence([], FailurePropagationMode.Propagate, []) + def basic(variant_op: any_op_t()): + matmul = structured_match(any_op_t(), variant_op, ops=["linalg.matmul"]) + top_func = get_parent_op(pdl.op_t(), matmul, op_name="func.func") + + @apply_patterns(top_func) + def pats(): + apply_patterns_canonicalization() + + top_func = structured_match(any_op_t(), variant_op, ops=["func.func"]) + apply_cse(top_func) diff --git a/mlir/test/python/integration/dialects/transform.py b/mlir/test/python/integration/dialects/transform.py new file mode 100644 index 0000000000000..bc88a61314d0d --- /dev/null +++ b/mlir/test/python/integration/dialects/transform.py @@ -0,0 +1,155 @@ +# RUN: %PYTHON %s 2>&1 | FileCheck %s + +from mlir.passmanager import PassManager +from mlir.ir import Context, Location, Module, InsertionPoint, UnitAttr +from mlir.dialects import scf, pdl, func, arith, linalg +from mlir.dialects.transform import ( + get_parent_op, + apply_patterns_canonicalization, + apply_cse, + any_op_t, +) 
+from mlir.dialects.transform.structured import structured_match +from mlir.dialects.transform.loop import loop_unroll +from mlir.dialects.transform.extras import named_sequence, apply_patterns +from mlir.extras import types as T +from mlir.dialects.builtin import module, ModuleOp + + +def construct_and_print_in_module(f): + print("\nTEST:", f.__name__) + with Context(), Location.unknown(): + module = Module.create() + with InsertionPoint(module.body): + module = f(module) + if module is not None: + print(module) + return f + + +# CHECK-LABEL: TEST: test_named_sequence +@construct_and_print_in_module +def test_named_sequence(module_): + # CHECK-LABEL: func.func @loop_unroll_op() { + # CHECK: %[[VAL_0:.*]] = arith.constant 0 : index + # CHECK: %[[VAL_1:.*]] = arith.constant 42 : index + # CHECK: %[[VAL_2:.*]] = arith.constant 5 : index + # CHECK: scf.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { + # CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_3]], %[[VAL_3]] : index + # CHECK: } + # CHECK: return + # CHECK: } + @func.func() + def loop_unroll_op(): + for i in scf.for_(0, 42, 5): + v = arith.addi(i, i) + scf.yield_([]) + + # CHECK-LABEL: module attributes {transform.with_named_sequence} { + # CHECK: transform.named_sequence @__transform_main(%[[VAL_0:.*]]: !transform.any_op) { + # CHECK: %[[VAL_1:.*]] = transform.structured.match ops{["arith.addi"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op + # CHECK: %[[VAL_2:.*]] = transform.get_parent_op %[[VAL_1]] {op_name = "scf.for"} : (!transform.any_op) -> !pdl.operation + # CHECK: transform.loop.unroll %[[VAL_2]] {factor = 4 : i64} : !pdl.operation + # CHECK: transform.yield + # CHECK: } + # CHECK: } + @module(attrs={"transform.with_named_sequence": UnitAttr.get()}) + def mod(): + @named_sequence("__transform_main", [any_op_t()], []) + def basic(target: any_op_t()): + m = structured_match(any_op_t(), target, ops=["arith.addi"]) + loop = get_parent_op(pdl.op_t(), m, op_name="scf.for") + loop_unroll(loop, 4) + + # The identifier (name) of the function becomes the Operation + assert isinstance(mod.opview, ModuleOp) + + print(module_) + + pm = PassManager.parse("builtin.module(transform-interpreter)") + pm.run(module_.operation) + + # CHECK-LABEL: func.func @loop_unroll_op() { + # CHECK: %[[VAL_0:.*]] = arith.constant 0 : index + # CHECK: %[[VAL_1:.*]] = arith.constant 42 : index + # CHECK: %[[VAL_2:.*]] = arith.constant 5 : index + # CHECK: %[[VAL_6:.*]] = arith.constant 40 : index + # CHECK: %[[VAL_7:.*]] = arith.constant 20 : index + # CHECK: scf.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_6]] step %[[VAL_7]] { + # CHECK: %[[VAL_5:.*]] = arith.addi %[[VAL_3]], %[[VAL_3]] : index + # CHECK: %[[VAL_8:.*]] = arith.constant 1 : index + # CHECK: %[[VAL_9:.*]] = arith.muli %[[VAL_2]], %[[VAL_8]] : index + # CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_3]], %[[VAL_9]] : index + # CHECK: %[[VAL_11:.*]] = arith.addi %[[VAL_10]], %[[VAL_10]] : index + # CHECK: %[[VAL_12:.*]] = arith.constant 2 : index + # CHECK: %[[VAL_13:.*]] = arith.muli %[[VAL_2]], %[[VAL_12]] : index + # CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_3]], %[[VAL_13]] : index + # CHECK: %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[VAL_14]] : index + # CHECK: %[[VAL_16:.*]] = arith.constant 3 : index + # CHECK: %[[VAL_17:.*]] = arith.muli %[[VAL_2]], %[[VAL_16]] : index + # CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_3]], %[[VAL_17]] : index + # CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_18]], %[[VAL_18]] : index + # CHECK: } + # CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_6]], 
%[[VAL_6]] : index + # CHECK: return + # CHECK: } + print(module_) + + +# CHECK-LABEL: TEST: test_apply_patterns +@construct_and_print_in_module +def test_apply_patterns(module_): + M, N, K = 3, 5, 3 + + # CHECK-LABEL: func.func @matmul( + # CHECK-SAME: %[[VAL_0:.*]]: tensor<3x5xf32>, %[[VAL_1:.*]]: tensor<5x3xf32>, %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> { + # CHECK: %[[VAL_3:.*]] = arith.constant 1 : i32 + # CHECK: %[[VAL_4:.*]] = arith.addi %[[VAL_3]], %[[VAL_3]] : i32 + # CHECK: %[[VAL_5:.*]] = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%[[VAL_0]], %[[VAL_1]] : tensor<3x5xf32>, tensor<5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32> + # CHECK: return %[[VAL_5]] : tensor<3x3xf32> + # CHECK: } + @func.func( + T.tensor(M, N, T.f32()), T.tensor(N, K, T.f32()), T.tensor(M, K, T.f32()) + ) + def matmul(A, B, C): + i = arith.constant(T.i32(), 1) + v = arith.addi(i, i) + return linalg.matmul(A, B, outs=[C]) + + # CHECK-LABEL: module attributes {transform.with_named_sequence} { + # CHECK: transform.named_sequence @__transform_main(%[[VAL_0:.*]]: !transform.any_op) { + # CHECK: %[[VAL_1:.*]] = transform.structured.match ops{["linalg.matmul"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op + # CHECK: %[[VAL_2:.*]] = transform.get_parent_op %[[VAL_1]] {op_name = "func.func"} : (!transform.any_op) -> !pdl.operation + # CHECK: transform.apply_patterns to %[[VAL_2]] { + # CHECK: transform.apply_patterns.canonicalization + # CHECK: } : !pdl.operation + # CHECK: %[[VAL_3:.*]] = transform.structured.match ops{["func.func"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op + # CHECK: transform.apply_cse to %[[VAL_3]] : !transform.any_op + # CHECK: transform.yield + # CHECK: } + # CHECK: } + @module(attrs={"transform.with_named_sequence": UnitAttr.get()}) + def mod(): + @named_sequence("__transform_main", [any_op_t()], []) + def basic(variant_op: any_op_t()): + matmul = structured_match(any_op_t(), variant_op, ops=["linalg.matmul"]) + top_func = get_parent_op(pdl.op_t(), matmul, op_name="func.func") + + @apply_patterns(top_func) + def pats(): + apply_patterns_canonicalization() + + top_func = structured_match(any_op_t(), variant_op, ops=["func.func"]) + apply_cse(top_func) + + print(module_) + + pm = PassManager.parse("builtin.module(transform-interpreter)") + pm.run(module_.operation) + + # CHECK-LABEL: func.func @matmul( + # CHECK-SAME: %[[VAL_0:.*]]: tensor<3x5xf32>, %[[VAL_1:.*]]: tensor<5x3xf32>, %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> { + # CHECK: %[[VAL_3:.*]] = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%[[VAL_0]], %[[VAL_1]] : tensor<3x5xf32>, tensor<5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32> + # CHECK: return %[[VAL_3]] : tensor<3x3xf32> + # CHECK: } + print(module_) From 88151dd4285cdd9feeb24ebb1be9cf5252ab0883 Mon Sep 17 00:00:00 2001 From: Finn Plummer <50529406+inbelic@users.noreply.github.com> Date: Thu, 21 Dec 2023 09:24:01 -0800 Subject: [PATCH 098/342] [mlir][spirv] Add folding for SNegate, [Logical]Not (#74992) Add missing constant propogation folder for SNegate, [Logical]Not. Implement additional folding when !(!x) for all ops. This helps for readability of lowered code into SPIR-V. 
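For example (condensed from the canonicalize.mlir cases added below; shown here only for illustration):

  %0 = spirv.SNegate %arg0 : i32
  %1 = spirv.SNegate %0 : i32    // now folds: %1 is replaced by %arg0
  %t = spirv.Constant true
  %2 = spirv.LogicalNot %t : i1  // now folds to spirv.Constant false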
Part of work for #70704 --- .../Dialect/SPIRV/IR/SPIRVArithmeticOps.td | 2 + .../mlir/Dialect/SPIRV/IR/SPIRVBitOps.td | 2 + .../mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td | 1 + .../SPIRV/IR/SPIRVCanonicalization.cpp | 55 ++++++++ .../SPIRV/Transforms/canonicalize.mlir | 128 ++++++++++++++++++ 5 files changed, 188 insertions(+) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td index 51124e141c6d4..22d5afcd77381 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVArithmeticOps.td @@ -582,6 +582,8 @@ def SPIRV_SNegateOp : SPIRV_ArithmeticUnaryOp<"SNegate", %3 = spirv.SNegate %2 : vector<4xi32> ``` }]; + + let hasFolder = 1; } // ----- diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBitOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBitOps.td index b460c8e68aa0c..38639a175ab4d 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBitOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBitOps.td @@ -462,6 +462,8 @@ def SPIRV_NotOp : SPIRV_BitUnaryOp<"Not", [UsableInSpecConstantOp]> { %3 = spirv.Not %1 : vector<4xi32> ``` }]; + + let hasFolder = 1; } #endif // MLIR_DIALECT_SPIRV_IR_BIT_OPS diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td index 2e26c44de281a..e48a56f0625d3 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVLogicalOps.td @@ -534,6 +534,7 @@ def SPIRV_LogicalNotOp : SPIRV_LogicalUnaryOp<"LogicalNot", }]; let hasCanonicalizer = 1; + let hasFolder = 1; } // ----- diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp index 08ddc7c25aa9e..4c62289a1e945 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVCanonicalization.cpp @@ -643,6 +643,45 @@ OpFoldResult spirv::UModOp::fold(FoldAdaptor adaptor) { return div0 ? Attribute() : res; } +//===----------------------------------------------------------------------===// +// spirv.SNegate +//===----------------------------------------------------------------------===// + +OpFoldResult spirv::SNegateOp::fold(FoldAdaptor adaptor) { + // -(-x) = 0 - (0 - x) = x + auto op = getOperand(); + if (auto negateOp = op.getDefiningOp<spirv::SNegateOp>()) + return negateOp->getOperand(0); + + // According to the SPIR-V spec: + // + // Signed-integer subtract of Operand from zero. + return constFoldUnaryOp<IntegerAttr>( + adaptor.getOperands(), [](const APInt &a) { + APInt zero = APInt::getZero(a.getBitWidth()); + return zero - a; + }); +} + +//===----------------------------------------------------------------------===// +// spirv.NotOp +//===----------------------------------------------------------------------===// + +OpFoldResult spirv::NotOp::fold(spirv::NotOp::FoldAdaptor adaptor) { + // !(!x) = x + auto op = getOperand(); + if (auto notOp = op.getDefiningOp<spirv::NotOp>()) + return notOp->getOperand(0); + + // According to the SPIR-V spec: + // + // Complement the bits of Operand. 
+ return constFoldUnaryOp<IntegerAttr>(adaptor.getOperands(), [&](APInt a) { + a.flipAllBits(); + return a; + }); +} + //===----------------------------------------------------------------------===// // spirv.LogicalAnd //===----------------------------------------------------------------------===// @@ -714,6 +753,22 @@ OpFoldResult spirv::LogicalNotEqualOp::fold(FoldAdaptor adaptor) { // spirv.LogicalNot //===----------------------------------------------------------------------===// +OpFoldResult spirv::LogicalNotOp::fold(FoldAdaptor adaptor) { + // !(!x) = x + auto op = getOperand(); + if (auto notOp = op.getDefiningOp<spirv::LogicalNotOp>()) + return notOp->getOperand(0); + + // According to the SPIR-V spec: + // + // Complement the bits of Operand. + return constFoldUnaryOp<IntegerAttr>(adaptor.getOperands(), + [](const APInt &a) { + APInt zero = APInt::getZero(1); + return a == 1 ? zero : (zero + 1); + }); +} + void spirv::LogicalNotOp::getCanonicalizationPatterns( RewritePatternSet &results, MLIRContext *context) { results diff --git a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir index 871ecd4f28b12..1cb69891a70ed 100644 --- a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir +++ b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir @@ -1006,6 +1006,102 @@ func.func @umod_fail_fold(%arg0: i32) -> (i32, i32) { // ----- +//===----------------------------------------------------------------------===// +// spirv.SNegate +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: @snegate_twice +// CHECK-SAME: (%[[ARG:.*]]: i32) +func.func @snegate_twice(%arg0 : i32) -> i32 { + %0 = spirv.SNegate %arg0 : i32 + %1 = spirv.SNegate %0 : i32 + + // CHECK: return %[[ARG]] : i32 + return %1 : i32 +} + +// CHECK-LABEL: @snegate_min +func.func @snegate_min() -> (i8, i8) { + // CHECK: %[[MIN:.*]] = spirv.Constant -128 : i8 + %cmin = spirv.Constant -128 : i8 + + %0 = spirv.SNegate %cmin : i8 + %1 = spirv.SNegate %0 : i8 + + // CHECK: return %[[MIN]], %[[MIN]] + return %0, %1 : i8, i8 +} + +// CHECK-LABEL: @const_fold_scalar_snegate +func.func @const_fold_scalar_snegate() -> (i32, i32, i32) { + %c0 = spirv.Constant 0 : i32 + %c3 = spirv.Constant 3 : i32 + %cn3 = spirv.Constant -3 : i32 + + // CHECK-DAG: %[[THREE:.*]] = spirv.Constant 3 : i32 + // CHECK-DAG: %[[NTHREE:.*]] = spirv.Constant -3 : i32 + // CHECK-DAG: %[[ZERO:.*]] = spirv.Constant 0 : i32 + %0 = spirv.SNegate %c0 : i32 + %1 = spirv.SNegate %c3 : i32 + %2 = spirv.SNegate %cn3 : i32 + + // CHECK: return %[[ZERO]], %[[NTHREE]], %[[THREE]] + return %0, %1, %2 : i32, i32, i32 +} + +// CHECK-LABEL: @const_fold_vector_snegate +func.func @const_fold_vector_snegate() -> vector<3xi32> { + // CHECK: spirv.Constant dense<[0, 3, -3]> + %cv = spirv.Constant dense<[0, -3, 3]> : vector<3xi32> + %0 = spirv.SNegate %cv : vector<3xi32> + return %0 : vector<3xi32> +} + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.Not +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: @not_twice +// CHECK-SAME: (%[[ARG:.*]]: i32) +func.func @not_twice(%arg0 : i32) -> i32 { + %0 = spirv.Not %arg0 : i32 + %1 = spirv.Not %0 : i32 + + // CHECK: return %[[ARG]] : i32 + return %1 : i32 +} + +// CHECK-LABEL: @const_fold_scalar_not +func.func @const_fold_scalar_not() -> (i32, i32, i32) { + %c0 = spirv.Constant 0 : i32 + %c3 = spirv.Constant 3 : i32 + %cn3 = 
spirv.Constant -3 : i32 + + // CHECK-DAG: %[[TWO:.*]] = spirv.Constant 2 : i32 + // CHECK-DAG: %[[NFOUR:.*]] = spirv.Constant -4 : i32 + // CHECK-DAG: %[[NONE:.*]] = spirv.Constant -1 : i32 + %0 = spirv.Not %c0 : i32 + %1 = spirv.Not %c3 : i32 + %2 = spirv.Not %cn3 : i32 + + // CHECK: return %[[NONE]], %[[NFOUR]], %[[TWO]] + return %0, %1, %2 : i32, i32, i32 +} + +// CHECK-LABEL: @const_fold_vector_not +func.func @const_fold_vector_not() -> vector<3xi32> { + %cv = spirv.Constant dense<[-1, -4, 2]> : vector<3xi32> + + // CHECK: spirv.Constant dense<[0, 3, -3]> + %0 = spirv.Not %cv : vector<3xi32> + + return %0 : vector<3xi32> +} + +// ----- + //===----------------------------------------------------------------------===// // spirv.LogicalAnd //===----------------------------------------------------------------------===// @@ -1040,6 +1136,38 @@ func.func @convert_logical_and_true_false_vector(%arg: vector<3xi1>) -> (vector< // spirv.LogicalNot //===----------------------------------------------------------------------===// +// CHECK-LABEL: @logical_not_twice +// CHECK-SAME: (%[[ARG:.*]]: i1) +func.func @logical_not_twice(%arg0 : i1) -> i1 { + %0 = spirv.LogicalNot %arg0 : i1 + %1 = spirv.LogicalNot %0 : i1 + + // CHECK: return %[[ARG]] : i1 + return %1 : i1 +} + +// CHECK-LABEL: @const_fold_scalar_logical_not +func.func @const_fold_scalar_logical_not() -> i1 { + %true = spirv.Constant true + + // CHECK: spirv.Constant false + %0 = spirv.LogicalNot %true : i1 + + return %0 : i1 +} + +// CHECK-LABEL: @const_fold_vector_logical_not +func.func @const_fold_vector_logical_not() -> vector<2xi1> { + %cv = spirv.Constant dense<[true, false]> : vector<2xi1> + + // CHECK: spirv.Constant dense<[false, true]> + %0 = spirv.LogicalNot %cv : vector<2xi1> + + return %0 : vector<2xi1> +} + +// ----- + func.func @convert_logical_not_to_not_equal(%arg0: vector<3xi64>, %arg1: vector<3xi64>) -> vector<3xi1> { // CHECK: %[[RESULT:.*]] = spirv.INotEqual {{%.*}}, {{%.*}} : vector<3xi64> // CHECK-NEXT: spirv.ReturnValue %[[RESULT]] : vector<3xi1> From a4e15416b41459b6f69086a22088520ee826f244 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell <benjamin.maxwell@arm.com> Date: Thu, 21 Dec 2023 17:46:12 +0000 Subject: [PATCH 099/342] [mlir][ArmSME] Move creation of load/store intrinsics to helpers (NFC) (#76168) Also, for consistency make the ZeroOp lowering switch on the ArmSMETileType, rather than the element bit width. --- .../Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp | 227 +++++++++--------- 1 file changed, 108 insertions(+), 119 deletions(-) diff --git a/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp b/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp index f9d6f04a811f3..0c6e2e80b88a3 100644 --- a/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp +++ b/mlir/lib/Conversion/ArmSMEToLLVM/ArmSMEToLLVM.cpp @@ -32,6 +32,95 @@ using namespace mlir; namespace { +/// Helper to create an arm_sme.intr.ld1*.(horiz|vert)' intrinsic. 
+static Operation *createLoadTileSliceIntrinsic( + RewriterBase &rewriter, Location loc, arm_sme::ArmSMETileType type, + arm_sme::TileSliceLayout layout, Value maskOp, Value ptr, + IntegerAttr tileId, Value tileSliceI32) { + if (layout == arm_sme::TileSliceLayout::Horizontal) { + switch (type) { + case arm_sme::ArmSMETileType::ZAB: + return rewriter.create<arm_sme::aarch64_sme_ld1b_horiz>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAH: + return rewriter.create<arm_sme::aarch64_sme_ld1h_horiz>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAS: + return rewriter.create<arm_sme::aarch64_sme_ld1w_horiz>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAD: + return rewriter.create<arm_sme::aarch64_sme_ld1d_horiz>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAQ: + return rewriter.create<arm_sme::aarch64_sme_ld1q_horiz>( + loc, maskOp, ptr, tileId, tileSliceI32); + } + } else { + switch (type) { + case arm_sme::ArmSMETileType::ZAB: + return rewriter.create<arm_sme::aarch64_sme_ld1b_vert>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAH: + return rewriter.create<arm_sme::aarch64_sme_ld1h_vert>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAS: + return rewriter.create<arm_sme::aarch64_sme_ld1w_vert>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAD: + return rewriter.create<arm_sme::aarch64_sme_ld1d_vert>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAQ: + return rewriter.create<arm_sme::aarch64_sme_ld1q_vert>( + loc, maskOp, ptr, tileId, tileSliceI32); + break; + } + } +} + +/// Helper to create an arm_sme.intr.st1*.(horiz|vert)' intrinsic. 
+static Operation *createStoreTileSliceIntrinsic( + RewriterBase &rewriter, Location loc, arm_sme::ArmSMETileType type, + arm_sme::TileSliceLayout layout, Value maskOp, Value ptr, + IntegerAttr tileId, Value tileSliceI32) { + if (layout == arm_sme::TileSliceLayout::Horizontal) { + switch (type) { + case arm_sme::ArmSMETileType::ZAB: + return rewriter.create<arm_sme::aarch64_sme_st1b_horiz>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAH: + return rewriter.create<arm_sme::aarch64_sme_st1h_horiz>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAS: + return rewriter.create<arm_sme::aarch64_sme_st1w_horiz>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAD: + return rewriter.create<arm_sme::aarch64_sme_st1d_horiz>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAQ: + return rewriter.create<arm_sme::aarch64_sme_st1q_horiz>( + loc, maskOp, ptr, tileId, tileSliceI32); + } + } else { + switch (type) { + case arm_sme::ArmSMETileType::ZAB: + return rewriter.create<arm_sme::aarch64_sme_st1b_vert>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAH: + return rewriter.create<arm_sme::aarch64_sme_st1h_vert>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAS: + return rewriter.create<arm_sme::aarch64_sme_st1w_vert>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAD: + return rewriter.create<arm_sme::aarch64_sme_st1d_vert>( + loc, maskOp, ptr, tileId, tileSliceI32); + case arm_sme::ArmSMETileType::ZAQ: + return rewriter.create<arm_sme::aarch64_sme_st1q_vert>( + loc, maskOp, ptr, tileId, tileSliceI32); + } + } +} + IntegerAttr getTileIdOrError(arm_sme::ArmSMETileOpInterface op) { auto tileId = op.getTileId(); if (!tileId) @@ -75,9 +164,6 @@ struct ZeroOpConversion : public ConvertOpToLLVMPattern<arm_sme::ZeroOp> { ConversionPatternRewriter &rewriter) const override { auto loc = zero.getLoc(); - unsigned tileElementWidth = - zero.getVectorType().getElementType().getIntOrFloatBitWidth(); - auto tileId = getTileIdOrError(zero); if (!tileId) return failure(); @@ -86,23 +172,24 @@ struct ZeroOpConversion : public ConvertOpToLLVMPattern<arm_sme::ZeroOp> { // The base mask is just the mask to zero the first tile (of a size). // These masks are derived from: // https://developer.arm.com/documentation/ddi0602/2022-06/SME-Instructions/ZERO--Zero-a-list-of-64-bit-element-ZA-tiles- + arm_sme::ArmSMETileType tileType = *zero.getAllocatedTileType(); auto baseMaskForSize = [&] { - switch (tileElementWidth) { - case 8: + switch (tileType) { + case arm_sme::ArmSMETileType::ZAB: // Zeroing the 8-bit ZA0.B tile is equivalent to zeroing all eight // 64-bit element tiles named ZA0.D to ZA7.D. return 0b1111'1111; - case 16: - // Zeroing the 16-bit ZA0.H tile is equivalent to zeroing 64-bit element - // tiles named ZA0.D, ZA2.D, ZA4.D, and ZA6.D. - // Shift this left once for ZA1.H. + case arm_sme::ArmSMETileType::ZAH: + // Zeroing the 16-bit ZA0.H tile is equivalent to zeroing 64-bit + // element tiles named ZA0.D, ZA2.D, ZA4.D, and ZA6.D. Shift this left + // once for ZA1.H. return 0b0101'0101; - case 32: + case arm_sme::ArmSMETileType::ZAS: // Zeroing the 32-bit ZA0.S tile is equivalent to zeroing 64-bit // element tiles named ZA0.D and ZA4.D. // Shift left by 1, 2, or 3 respectively for ZA1.S, ZA2.S, ZA3.S. 
return 0b0001'0001; - case 64: + case arm_sme::ArmSMETileType::ZAD: // Zeroing one of the a 64-bit tiles ZA0.D to ZA7.D just requires // setting the bit for that tile. return 0b0000'0001; @@ -172,63 +259,13 @@ struct LoadTileSliceConversion // Create all active predicate mask. auto maskOp = loadTileSliceOp.getMask(); - auto tileType = loadTileSliceOp.getVectorType(); - auto tileElementType = tileType.getElementType(); - unsigned tileElementWidth = tileElementType.getIntOrFloatBitWidth(); + auto tileVectorType = loadTileSliceOp.getVectorType(); + arm_sme::ArmSMETileType tileType = *arm_sme::getSMETileType(tileVectorType); arm_sme::TileSliceLayout layout = loadTileSliceOp.getLayout(); // Create 'arm_sme.intr.ld1*.(horiz|vert)' intrinsic to load ZA tile slice. - if (layout == arm_sme::TileSliceLayout::Horizontal) { - switch (tileElementWidth) { - default: - llvm_unreachable("unexpected element type!"); - case 8: - rewriter.create<arm_sme::aarch64_sme_ld1b_horiz>(loc, maskOp, ptr, - tileId, tileSliceI32); - break; - case 16: - rewriter.create<arm_sme::aarch64_sme_ld1h_horiz>(loc, maskOp, ptr, - tileId, tileSliceI32); - break; - case 32: - rewriter.create<arm_sme::aarch64_sme_ld1w_horiz>(loc, maskOp, ptr, - tileId, tileSliceI32); - break; - case 64: - rewriter.create<arm_sme::aarch64_sme_ld1d_horiz>(loc, maskOp, ptr, - tileId, tileSliceI32); - break; - case 128: - rewriter.create<arm_sme::aarch64_sme_ld1q_horiz>(loc, maskOp, ptr, - tileId, tileSliceI32); - break; - } - } else { - switch (tileElementWidth) { - default: - llvm_unreachable("unexpected element type!"); - case 8: - rewriter.create<arm_sme::aarch64_sme_ld1b_vert>(loc, maskOp, ptr, - tileId, tileSliceI32); - break; - case 16: - rewriter.create<arm_sme::aarch64_sme_ld1h_vert>(loc, maskOp, ptr, - tileId, tileSliceI32); - break; - case 32: - rewriter.create<arm_sme::aarch64_sme_ld1w_vert>(loc, maskOp, ptr, - tileId, tileSliceI32); - break; - case 64: - rewriter.create<arm_sme::aarch64_sme_ld1d_vert>(loc, maskOp, ptr, - tileId, tileSliceI32); - break; - case 128: - rewriter.create<arm_sme::aarch64_sme_ld1q_vert>(loc, maskOp, ptr, - tileId, tileSliceI32); - break; - } - } + createLoadTileSliceIntrinsic(rewriter, loc, tileType, layout, maskOp, ptr, + tileId, tileSliceI32); // The load intrinsics have no result, replace 'arm_sme.tile_load' with // the input tile to preserve dataflow. 
@@ -249,9 +286,7 @@ struct StoreTileSliceConversion arm_sme::StoreTileSliceOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto loc = storeTileSliceOp.getLoc(); - auto tileType = storeTileSliceOp.getVectorType(); - auto tileElementType = tileType.getElementType(); - unsigned tileElementWidth = tileElementType.getIntOrFloatBitWidth(); + auto tileVectorType = storeTileSliceOp.getVectorType(); auto tileId = getTileIdOrError(storeTileSliceOp); if (!tileId) @@ -271,58 +306,12 @@ struct StoreTileSliceConversion auto maskOp = storeTileSliceOp.getMask(); arm_sme::TileSliceLayout layout = storeTileSliceOp.getLayout(); + arm_sme::ArmSMETileType tileType = *arm_sme::getSMETileType(tileVectorType); - if (layout == arm_sme::TileSliceLayout::Horizontal) { - switch (tileElementWidth) { - default: - llvm_unreachable("unexpected element type!"); - case 8: - rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1b_horiz>( - storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32); - break; - case 16: - rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1h_horiz>( - storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32); - break; - case 32: - rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1w_horiz>( - storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32); - break; - case 64: - rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1d_horiz>( - storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32); - break; - case 128: - rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1q_horiz>( - storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32); - break; - } - } else { - switch (tileElementWidth) { - default: - llvm_unreachable("unexpected element type!"); - case 8: - rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1b_vert>( - storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32); - break; - case 16: - rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1h_vert>( - storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32); - break; - case 32: - rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1w_vert>( - storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32); - break; - case 64: - rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1d_vert>( - storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32); - break; - case 128: - rewriter.replaceOpWithNewOp<arm_sme::aarch64_sme_st1q_vert>( - storeTileSliceOp, maskOp, ptr, tileId, tileSliceI32); - break; - } - } + rewriter.replaceOp(storeTileSliceOp, + createStoreTileSliceIntrinsic(rewriter, loc, tileType, + layout, maskOp, ptr, + tileId, tileSliceI32)); return success(); } From 34a65980d7d2e1b05e3fc88535cafe606ee55e04 Mon Sep 17 00:00:00 2001 From: Billy Zhu <billyzhu@modular.com> Date: Thu, 21 Dec 2023 09:54:48 -0800 Subject: [PATCH 100/342] [MLIR] Erase location of folded constants (#75415) Follow up to the discussion from #75258, and serves as an alternate solution for #74670. Set the location to Unknown for deduplicated / moved / materialized constants by OperationFolder. This makes sure that the folded constants don't end up with an arbitrary location of one of the original ops that became it, and that hoisted ops don't confuse the stepping order. 
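As a rough illustration (condensed from the canonicalize-debuginfo.mlir test added below):

  // Input: identical constants carrying distinct locations.
  %0 = arith.constant 42 : index loc("merge_constants":0:0)
  %1 = arith.constant 42 : index loc("merge_constants":1:0)

  // After canonicalization a single constant remains, and it now carries
  // loc(unknown) rather than arbitrarily keeping one of the source locations.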
--- mlir/include/mlir/Transforms/FoldUtils.h | 8 +++- mlir/lib/Transforms/SCCP.cpp | 2 +- mlir/lib/Transforms/Utils/FoldUtils.cpp | 33 +++++++++---- .../Transform/test-pattern-application.mlir | 2 - .../Transforms/canonicalize-debuginfo.mlir | 46 +++++++++++++++++++ .../Transforms/constant-fold-debuginfo.mlir | 42 +++++++++++++++++ .../lib/Transforms/TestIntRangeInference.cpp | 5 +- 7 files changed, 120 insertions(+), 18 deletions(-) create mode 100644 mlir/test/Transforms/canonicalize-debuginfo.mlir create mode 100644 mlir/test/Transforms/constant-fold-debuginfo.mlir diff --git a/mlir/include/mlir/Transforms/FoldUtils.h b/mlir/include/mlir/Transforms/FoldUtils.h index 2600da361496c..2e7a6fe3e362c 100644 --- a/mlir/include/mlir/Transforms/FoldUtils.h +++ b/mlir/include/mlir/Transforms/FoldUtils.h @@ -33,7 +33,8 @@ class Value; class OperationFolder { public: OperationFolder(MLIRContext *ctx, OpBuilder::Listener *listener = nullptr) - : interfaces(ctx), rewriter(ctx, listener) {} + : erasedFoldedLocation(UnknownLoc::get(ctx)), interfaces(ctx), + rewriter(ctx, listener) {} /// Tries to perform folding on the given `op`, including unifying /// deduplicated constants. If successful, replaces `op`'s uses with @@ -65,7 +66,7 @@ class OperationFolder { /// be created in a parent block. On success this returns the constant /// operation, nullptr otherwise. Value getOrCreateConstant(Block *block, Dialect *dialect, Attribute value, - Type type, Location loc); + Type type); private: /// This map keeps track of uniqued constants by dialect, attribute, and type. @@ -95,6 +96,9 @@ class OperationFolder { Dialect *dialect, Attribute value, Type type, Location loc); + /// The location to overwrite with for folder-owned constants. + UnknownLoc erasedFoldedLocation; + /// A mapping between an insertion region and the constants that have been /// created within it. DenseMap<Region *, ConstantMap> foldScopes; diff --git a/mlir/lib/Transforms/SCCP.cpp b/mlir/lib/Transforms/SCCP.cpp index 14435b37acc91..b2d3929b04596 100644 --- a/mlir/lib/Transforms/SCCP.cpp +++ b/mlir/lib/Transforms/SCCP.cpp @@ -53,7 +53,7 @@ static LogicalResult replaceWithConstant(DataFlowSolver &solver, Dialect *dialect = latticeValue.getConstantDialect(); Value constant = folder.getOrCreateConstant( builder.getInsertionBlock(), dialect, latticeValue.getConstantValue(), - value.getType(), value.getLoc()); + value.getType()); if (!constant) return failure(); diff --git a/mlir/lib/Transforms/Utils/FoldUtils.cpp b/mlir/lib/Transforms/Utils/FoldUtils.cpp index eb4dcb251a228..e5f78abf7fca5 100644 --- a/mlir/lib/Transforms/Utils/FoldUtils.cpp +++ b/mlir/lib/Transforms/Utils/FoldUtils.cpp @@ -77,8 +77,10 @@ LogicalResult OperationFolder::tryToFold(Operation *op, bool *inPlaceUpdate) { // Check to see if we should rehoist, i.e. if a non-constant operation was // inserted before this one. Block *opBlock = op->getBlock(); - if (&opBlock->front() != op && !isFolderOwnedConstant(op->getPrevNode())) + if (&opBlock->front() != op && !isFolderOwnedConstant(op->getPrevNode())) { op->moveBefore(&opBlock->front()); + op->setLoc(erasedFoldedLocation); + } return failure(); } @@ -112,8 +114,10 @@ bool OperationFolder::insertKnownConstant(Operation *op, Attribute constValue) { // If this is a constant we unique'd, we don't need to insert, but we can // check to see if we should rehoist it. 
if (isFolderOwnedConstant(op)) { - if (&opBlock->front() != op && !isFolderOwnedConstant(op->getPrevNode())) + if (&opBlock->front() != op && !isFolderOwnedConstant(op->getPrevNode())) { op->moveBefore(&opBlock->front()); + op->setLoc(erasedFoldedLocation); + } return true; } @@ -142,6 +146,7 @@ bool OperationFolder::insertKnownConstant(Operation *op, Attribute constValue) { if (folderConstOp) { notifyRemoval(op); rewriter.replaceOp(op, folderConstOp->getResults()); + folderConstOp->setLoc(erasedFoldedLocation); return false; } @@ -151,8 +156,10 @@ bool OperationFolder::insertKnownConstant(Operation *op, Attribute constValue) { // anything. Otherwise, we move the constant to the insertion block. Block *insertBlock = &insertRegion->front(); if (opBlock != insertBlock || (&insertBlock->front() != op && - !isFolderOwnedConstant(op->getPrevNode()))) + !isFolderOwnedConstant(op->getPrevNode()))) { op->moveBefore(&insertBlock->front()); + op->setLoc(erasedFoldedLocation); + } folderConstOp = op; referencedDialects[op].push_back(op->getDialect()); @@ -193,17 +200,17 @@ void OperationFolder::clear() { /// Get or create a constant using the given builder. On success this returns /// the constant operation, nullptr otherwise. Value OperationFolder::getOrCreateConstant(Block *block, Dialect *dialect, - Attribute value, Type type, - Location loc) { + Attribute value, Type type) { // Find an insertion point for the constant. auto *insertRegion = getInsertionRegion(interfaces, block); auto &entry = insertRegion->front(); rewriter.setInsertionPoint(&entry, entry.begin()); // Get the constant map for the insertion region of this operation. + // Use erased location since the op is being built at the front of block. auto &uniquedConstants = foldScopes[insertRegion]; - Operation *constOp = - tryGetOrCreateConstant(uniquedConstants, dialect, value, type, loc); + Operation *constOp = tryGetOrCreateConstant(uniquedConstants, dialect, value, + type, erasedFoldedLocation); return constOp ? constOp->getResult(0) : Value(); } @@ -254,8 +261,9 @@ OperationFolder::processFoldResults(Operation *op, // Check to see if there is a canonicalized version of this constant. auto res = op->getResult(i); Attribute attrRepl = foldResults[i].get<Attribute>(); - if (auto *constOp = tryGetOrCreateConstant( - uniquedConstants, dialect, attrRepl, res.getType(), op->getLoc())) { + if (auto *constOp = + tryGetOrCreateConstant(uniquedConstants, dialect, attrRepl, + res.getType(), erasedFoldedLocation)) { // Ensure that this constant dominates the operation we are replacing it // with. This may not automatically happen if the operation being folded // was inserted before the constant within the insertion block. @@ -290,8 +298,11 @@ OperationFolder::tryGetOrCreateConstant(ConstantMap &uniquedConstants, // Check if an existing mapping already exists. auto constKey = std::make_tuple(dialect, value, type); Operation *&constOp = uniquedConstants[constKey]; - if (constOp) + if (constOp) { + if (loc != constOp->getLoc()) + constOp->setLoc(erasedFoldedLocation); return constOp; + } // If one doesn't exist, try to materialize one. 
if (!(constOp = materializeConstant(dialect, rewriter, value, type, loc))) @@ -314,6 +325,8 @@ OperationFolder::tryGetOrCreateConstant(ConstantMap &uniquedConstants, notifyRemoval(constOp); rewriter.eraseOp(constOp); referencedDialects[existingOp].push_back(dialect); + if (loc != existingOp->getLoc()) + existingOp->setLoc(erasedFoldedLocation); return constOp = existingOp; } diff --git a/mlir/test/Dialect/Transform/test-pattern-application.mlir b/mlir/test/Dialect/Transform/test-pattern-application.mlir index 2fd47c6bae396..ff9a535c83843 100644 --- a/mlir/test/Dialect/Transform/test-pattern-application.mlir +++ b/mlir/test/Dialect/Transform/test-pattern-application.mlir @@ -179,7 +179,6 @@ module { // CHECK: return %[[c5]] func.func @canonicalization(%t: tensor<5xf32>) -> index { %c0 = arith.constant 0 : index - // expected-remark @below {{op was replaced}} %dim = tensor.dim %t, %c0 : tensor<5xf32> return %dim : index } @@ -191,7 +190,6 @@ transform.sequence failures(propagate) { transform.apply_patterns to %1 { transform.apply_patterns.canonicalization } : !transform.any_op - transform.test_print_remark_at_operand %0, "op was replaced" : !transform.any_op } // ----- diff --git a/mlir/test/Transforms/canonicalize-debuginfo.mlir b/mlir/test/Transforms/canonicalize-debuginfo.mlir new file mode 100644 index 0000000000000..30c8022daa76b --- /dev/null +++ b/mlir/test/Transforms/canonicalize-debuginfo.mlir @@ -0,0 +1,46 @@ +// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(canonicalize{test-convergence}))' -split-input-file -mlir-print-debuginfo | FileCheck %s + +// CHECK-LABEL: func @merge_constants +func.func @merge_constants() -> (index, index, index, index) { + // CHECK-NEXT: arith.constant 42 : index loc(#[[UnknownLoc:.*]]) + %0 = arith.constant 42 : index loc("merge_constants":0:0) + %1 = arith.constant 42 : index loc("merge_constants":1:0) + %2 = arith.constant 42 : index loc("merge_constants":2:0) + %3 = arith.constant 42 : index loc("merge_constants":2:0) + return %0, %1, %2, %3 : index, index, index, index +} +// CHECK: #[[UnknownLoc]] = loc(unknown) + +// ----- + +// CHECK-LABEL: func @simple_hoist +func.func @simple_hoist(%arg0: memref<8xi32>) -> i32 { + // CHECK: arith.constant 88 : i32 loc(#[[UnknownLoc:.*]]) + // CHECK: arith.constant 42 : i32 loc(#[[ConstLoc0:.*]]) + // CHECK: arith.constant 0 : index loc(#[[ConstLoc1:.*]]) + %0 = arith.constant 42 : i32 loc("simple_hoist":0:0) + %1 = arith.constant 0 : index loc("simple_hoist":1:0) + memref.store %0, %arg0[%1] : memref<8xi32> + + %2 = arith.constant 88 : i32 loc("simple_hoist":2:0) + + return %2 : i32 +} +// CHECK-DAG: #[[UnknownLoc]] = loc(unknown) +// CHECK-DAG: #[[ConstLoc0]] = loc("simple_hoist":0:0) +// CHECK-DAG: #[[ConstLoc1]] = loc("simple_hoist":1:0) + +// ----- + +// CHECK-LABEL: func @hoist_and_merge +func.func @hoist_and_merge(%arg0: memref<8xi32>) { + // CHECK-NEXT: arith.constant 42 : i32 loc(#[[UnknownLoc:.*]]) + affine.for %arg1 = 0 to 8 { + %0 = arith.constant 42 : i32 loc("hoist_and_merge":0:0) + %1 = arith.constant 42 : i32 loc("hoist_and_merge":1:0) + memref.store %0, %arg0[%arg1] : memref<8xi32> + memref.store %1, %arg0[%arg1] : memref<8xi32> + } + return +} loc("hoist_and_merge":2:0) +// CHECK: #[[UnknownLoc]] = loc(unknown) diff --git a/mlir/test/Transforms/constant-fold-debuginfo.mlir b/mlir/test/Transforms/constant-fold-debuginfo.mlir new file mode 100644 index 0000000000000..c308bc477bee4 --- /dev/null +++ b/mlir/test/Transforms/constant-fold-debuginfo.mlir @@ -0,0 +1,42 @@ +// RUN: mlir-opt 
%s -split-input-file -test-constant-fold -mlir-print-debuginfo | FileCheck %s + +// CHECK-LABEL: func @fold_and_merge +func.func @fold_and_merge() -> (i32, i32) { + // CHECK-NEXT: [[C:%.+]] = arith.constant 6 : i32 loc(#[[UnknownLoc:.*]]) + %0 = arith.constant 1 : i32 loc("fold_and_merge":0:0) + %1 = arith.constant 5 : i32 loc("fold_and_merge":1:0) + %2 = arith.addi %0, %1 : i32 loc("fold_and_merge":2:0) + + %3 = arith.constant 6 : i32 loc("fold_and_merge":3:0) + + return %2, %3: i32, i32 +} +// CHECK: #[[UnknownLoc]] = loc(unknown) + +// ----- + +// CHECK-LABEL: func @materialize_different_dialect +func.func @materialize_different_dialect() -> (f32, f32) { + // CHECK: arith.constant 1.{{0*}}e+00 : f32 loc(#[[UnknownLoc:.*]]) + %0 = arith.constant -1.0 : f32 loc("materialize_different_dialect":0:0) + %1 = math.absf %0 : f32 loc("materialize_different_dialect":1:0) + %2 = arith.constant 1.0 : f32 loc("materialize_different_dialect":2:0) + + return %1, %2: f32, f32 +} +// CHECK: #[[UnknownLoc]] = loc(unknown) + +// ----- + +// CHECK-LABEL: func @materialize_in_front +func.func @materialize_in_front(%arg0: memref<8xi32>) { + // CHECK-NEXT: arith.constant 6 : i32 loc(#[[UnknownLoc:.*]]) + affine.for %arg1 = 0 to 8 { + %1 = arith.constant 1 : i32 loc("materialize_in_front":0:0) + %2 = arith.constant 5 : i32 loc("materialize_in_front":1:0) + %3 = arith.addi %1, %2 : i32 loc("materialize_in_front":2:0) + memref.store %3, %arg0[%arg1] : memref<8xi32> + } + return +} loc("materialize_in_front":3:0) +// CHECK: #[[UnknownLoc]] = loc(unknown) diff --git a/mlir/test/lib/Transforms/TestIntRangeInference.cpp b/mlir/test/lib/Transforms/TestIntRangeInference.cpp index 2f6dd5b8095df..5758f6acf2f0f 100644 --- a/mlir/test/lib/Transforms/TestIntRangeInference.cpp +++ b/mlir/test/lib/Transforms/TestIntRangeInference.cpp @@ -40,9 +40,8 @@ static LogicalResult replaceWithConstant(DataFlowSolver &solver, OpBuilder &b, maybeDefiningOp ? maybeDefiningOp->getDialect() : value.getParentRegion()->getParentOp()->getDialect(); Attribute constAttr = b.getIntegerAttr(value.getType(), *maybeConstValue); - Value constant = - folder.getOrCreateConstant(b.getInsertionBlock(), valueDialect, constAttr, - value.getType(), value.getLoc()); + Value constant = folder.getOrCreateConstant( + b.getInsertionBlock(), valueDialect, constAttr, value.getType()); if (!constant) return failure(); From 192f720178732885ec86062b92daf3d562aea427 Mon Sep 17 00:00:00 2001 From: Tomas Matheson <tomas.matheson@arm.com> Date: Thu, 21 Dec 2023 16:30:19 +0000 Subject: [PATCH 101/342] Re-land "[AArch64] Add FEAT_PAuthLR assembler support" (#75947) This reverts commit 199a0f9f5aaf72ff856f68e3bb708e783252af17. Fixed the left-shift of signed integer which was causing UB. 
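For reference, a minimal sketch of the new FEAT_PAuth_LR instruction forms exercised by the added MC tests (syntax only; armv9.5a-pauthlr.s below is the authoritative reference):

  .Lentry:
    paciasppc            // sign LR, fixed-register form
    autiasppc .Lentry    // authenticate LR against a (negative) PC-relative label
    autiasppc x3         // authenticate LR, register form
    retaasppc .Lentry    // combined authenticate-and-return, PC-relative form
    retaasppc x2         // combined authenticate-and-return, register form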
--- llvm/lib/Target/AArch64/AArch64.td | 9 +- .../lib/Target/AArch64/AArch64InstrFormats.td | 74 +++++++++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 39 +++++ llvm/lib/Target/AArch64/AArch64SchedA64FX.td | 2 +- .../Target/AArch64/AArch64SchedNeoverseN2.td | 2 +- .../AArch64/AsmParser/AArch64AsmParser.cpp | 28 ++++ .../Disassembler/AArch64Disassembler.cpp | 18 +++ .../MCTargetDesc/AArch64AsmBackend.cpp | 14 ++ .../MCTargetDesc/AArch64ELFObjectWriter.cpp | 4 + .../AArch64/MCTargetDesc/AArch64FixupKinds.h | 5 + .../MCTargetDesc/AArch64MCCodeEmitter.cpp | 29 ++++ .../MC/AArch64/armv9.5a-pauthlr-diagnostics.s | 57 +++++++ llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s | 12 ++ llvm/test/MC/AArch64/armv9.5a-pauthlr.s | 151 ++++++++++++++++++ .../Disassembler/AArch64/armv9.5a-pauthlr.txt | 78 +++++++++ 15 files changed, 518 insertions(+), 4 deletions(-) create mode 100644 llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s create mode 100644 llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s create mode 100644 llvm/test/MC/AArch64/armv9.5a-pauthlr.s create mode 100644 llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index db92a94e40e4b..97e92a57a7ff4 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -622,8 +622,13 @@ def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedO def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly", "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">; +// AArch64 2023 Architecture Extensions (v9.5-A) + def FeatureCPA : SubtargetFeature<"cpa", "HasCPA", "true", - "Enable ARMv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">; + "Enable Armv9.5-A Checked Pointer Arithmetic (FEAT_CPA)">; + +def FeaturePAuthLR : SubtargetFeature<"pauth-lr", "HasPAuthLR", + "true", "Enable Armv9.5-A PAC enhancements (FEAT_PAuth_LR)">; //===----------------------------------------------------------------------===// // Architectures. 
@@ -810,7 +815,7 @@ def SMEUnsupported : AArch64Unsupported { SME2Unsupported.F); } -let F = [HasPAuth] in +let F = [HasPAuth, HasPAuthLR] in def PAUnsupported : AArch64Unsupported; include "AArch64SchedA53.td" diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 690ac0dcda621..cb63d8726744d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -2368,6 +2368,80 @@ class ClearAuth<bits<1> data, string asm> let Inst{4-0} = Rd; } +// v9.5-A FEAT_PAuth_LR + +class SignAuthFixedRegs<bits<5> opcode2, bits<6> opcode, string asm> + : I<(outs), (ins), asm, "", "", []>, + Sched<[WriteI, ReadI]> { + let Inst{31} = 0b1; // sf + let Inst{30} = 0b1; + let Inst{29} = 0b0; // S + let Inst{28-21} = 0b11010110; + let Inst{20-16} = opcode2; + let Inst{15-10} = opcode; + let Inst{9-5} = 0b11111; // Rn + let Inst{4-0} = 0b11110; // Rd +} + +def PAuthPCRelLabel16Operand : PCRelLabel<16> { + let Name = "PAuthPCRelLabel16"; + let PredicateMethod = "isPAuthPCRelLabel16Operand"; +} +def am_pauth_pcrel : Operand<OtherVT> { + let EncoderMethod = "getPAuthPCRelOpValue"; + let DecoderMethod = "DecodePCRelLabel16"; + let PrintMethod = "printAlignedLabel"; + let ParserMatchClass = PAuthPCRelLabel16Operand; + let OperandType = "OPERAND_PCREL"; +} + +class SignAuthPCRel<bits<2> opc, string asm> + : I<(outs), (ins am_pauth_pcrel:$label), asm, "\t$label", "", []>, + Sched<[]> { + bits<16> label; + let Inst{31} = 0b1; // sf + let Inst{30-23} = 0b11100111; + let Inst{22-21} = opc; + let Inst{20-5} = label; // imm + let Inst{4-0} = 0b11111; // Rd +} + +class SignAuthOneReg<bits<5> opcode2, bits<6> opcode, string asm> + : I<(outs), (ins GPR64:$Rn), asm, "\t$Rn", "", []>, + Sched<[]> { + bits<5> Rn; + let Inst{31} = 0b1; // sf + let Inst{30} = 0b1; + let Inst{29} = 0b0; // S + let Inst{28-21} = 0b11010110; + let Inst{20-16} = opcode2; + let Inst{15-10} = opcode; + let Inst{9-5} = Rn; + let Inst{4-0} = 0b11110; // Rd +} + +class SignAuthReturnPCRel<bits<3> opc, bits<5> op2, string asm> + : I<(outs), (ins am_pauth_pcrel:$label), asm, "\t$label", "", []>, + Sched<[WriteAtomic]> { + bits<16> label; + let Inst{31-24} = 0b01010101; + let Inst{23-21} = opc; + let Inst{20-5} = label; // imm16 + let Inst{4-0} = op2; +} + +class SignAuthReturnReg<bits<6> op3, string asm> + : I<(outs), (ins GPR64common:$Rm), asm, "\t$Rm", "", []>, + Sched<[WriteAtomic]> { + bits<5> Rm; + let Inst{31-25} = 0b1101011; + let Inst{24-21} = 0b0010; // opc + let Inst{20-16} = 0b11111; // op2 + let Inst{15-10} = op3; + let Inst{9-5} = 0b11111; // Rn + let Inst{4-0} = Rm; // op4 (Rm) +} + // Base class for the Armv8.4-A 8 and 16-bit flag manipulation instructions class BaseFlagManipulation<bit sf, bit sz, dag iops, string asm, string ops> : I<(outs), iops, asm, ops, "", []>, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 4ccac40f99a0a..977729bb082b7 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -61,6 +61,9 @@ def HasLOR : Predicate<"Subtarget->hasLOR()">, def HasPAuth : Predicate<"Subtarget->hasPAuth()">, AssemblerPredicateWithAll<(all_of FeaturePAuth), "pauth">; +def HasPAuthLR : Predicate<"Subtarget->hasPAuthLR()">, + AssemblerPredicateWithAll<(all_of FeaturePAuthLR), "pauth-lr">; + def HasJS : Predicate<"Subtarget->hasJS()">, AssemblerPredicateWithAll<(all_of FeatureJS), "jsconv">; @@ -1646,6 +1649,42 @@ let 
Predicates = [HasPAuth] in { } +// v9.5-A pointer authentication extensions + +// Always accept "pacm" as an alias for "hint #39", but don't emit it when +// disassembling if we don't have the pauth-lr feature. +let CRm = 0b0100 in { + def PACM : SystemNoOperands<0b111, "hint\t#39">; +} +def : InstAlias<"pacm", (PACM), 0>; + +let Predicates = [HasPAuthLR] in { + let Defs = [LR], Uses = [LR, SP] in { + // opcode2, opcode, asm + def PACIASPPC : SignAuthFixedRegs<0b00001, 0b101000, "paciasppc">; + def PACIBSPPC : SignAuthFixedRegs<0b00001, 0b101001, "pacibsppc">; + def PACNBIASPPC : SignAuthFixedRegs<0b00001, 0b100000, "pacnbiasppc">; + def PACNBIBSPPC : SignAuthFixedRegs<0b00001, 0b100001, "pacnbibsppc">; + // opc, asm + def AUTIASPPCi : SignAuthPCRel<0b00, "autiasppc">; + def AUTIBSPPCi : SignAuthPCRel<0b01, "autibsppc">; + // opcode2, opcode, asm + def AUTIASPPCr : SignAuthOneReg<0b00001, 0b100100, "autiasppc">; + def AUTIBSPPCr : SignAuthOneReg<0b00001, 0b100101, "autibsppc">; + } + + let Uses = [LR, SP], isReturn = 1, isTerminator = 1, isBarrier = 1 in { + // opc, op2, asm + def RETAASPPCi : SignAuthReturnPCRel<0b000, 0b11111, "retaasppc">; + def RETABSPPCi : SignAuthReturnPCRel<0b001, 0b11111, "retabsppc">; + // op3, asm + def RETAASPPCr : SignAuthReturnReg<0b000010, "retaasppc">; + def RETABSPPCr : SignAuthReturnReg<0b000011, "retabsppc">; + } + def : InstAlias<"pacm", (PACM), 1>; +} + + // v8.3a floating point conversion for javascript let Predicates = [HasJS, HasFPARMv8], Defs = [NZCV] in def FJCVTZS : BaseFPToIntegerUnscaled<0b01, 0b11, 0b110, FPR64, GPR32, diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td index 813b4a3affcfd..7edce4b61605d 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td @@ -22,7 +22,7 @@ def A64FXModel : SchedMachineModel { list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F, SVEUnsupported.F, [HasMTE, HasMatMulInt8, HasBF16, - HasPAuth, HasCPA]); + HasPAuth, HasPAuthLR, HasCPA]); let FullInstRWOverlapCheck = 0; } diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td index 53cf725f0e235..a6fab5e6245f8 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -19,7 +19,7 @@ def NeoverseN2Model : SchedMachineModel { let CompleteModel = 1; list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F, - [HasSVE2p1, HasCPA]); + [HasSVE2p1, HasPAuthLR, HasCPA]); } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 74afa4183e67e..38a92cb096029 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1696,6 +1696,21 @@ class AArch64Operand : public MCParsedAsmOperand { return DiagnosticPredicateTy::Match; } + bool isPAuthPCRelLabel16Operand() const { + // PAuth PCRel16 operands are similar to regular branch targets, but only + // negative values are allowed for concrete immediates as signing instr + // should be in a lower address. 
+ if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return true; + int64_t Val = MCE->getValue(); + if (Val & 0b11) + return false; + return (Val <= 0) && (Val > -(1 << 18)); + } + void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. Null MCExpr = 0. if (!Expr) @@ -1997,6 +2012,19 @@ class AArch64Operand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2)); } + void addPAuthPCRelLabel16Operands(MCInst &Inst, unsigned N) const { + // PC-relative operands don't encode the low bits, so shift them off + // here. If it's a label, however, just put it on directly as there's + // not enough information now to do anything. + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) { + addExpr(Inst, getImm()); + return; + } + Inst.addOperand(MCOperand::createImm(MCE->getValue() >> 2)); + } + void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const { // Branch operands don't encode the low bits, so shift them off // here. If it's a label, however, just put it on directly as there's diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index cf2d3879292d1..e3220d103ae0d 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -165,6 +165,9 @@ static DecodeStatus DecodeFixedPointScaleImm32(MCInst &Inst, unsigned Imm, static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, uint64_t Address, const MCDisassembler *Decoder); +static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm, + uint64_t Address, + const MCDisassembler *Decoder); static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, uint64_t Address, const MCDisassembler *Decoder); @@ -887,6 +890,21 @@ static DecodeStatus DecodeFixedPointScaleImm64(MCInst &Inst, unsigned Imm, return Success; } +static DecodeStatus DecodePCRelLabel16(MCInst &Inst, unsigned Imm, + uint64_t Addr, + const MCDisassembler *Decoder) { + // Immediate is encoded as the top 16-bits of an unsigned 18-bit negative + // PC-relative offset. 
+ uint64_t ImmVal = Imm; + if (ImmVal < 0 || ImmVal > (1 << 16)) + return Fail; + ImmVal = -ImmVal; + if (!Decoder->tryAddingSymbolicOperand(Inst, (ImmVal << 2), Addr, + /*IsBranch=*/false, 0, 0, 4)) + Inst.addOperand(MCOperand::createImm(ImmVal)); + return Success; +} + static DecodeStatus DecodePCRelLabel19(MCInst &Inst, unsigned Imm, uint64_t Addr, const MCDisassembler *Decoder) { diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index a6900b8963bb3..30ef3680ae79c 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -67,6 +67,7 @@ class AArch64AsmBackend : public MCAsmBackend { {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal}, {"fixup_aarch64_movw", 5, 16, 0}, {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal}, + {"fixup_aarch64_pcrel_branch16", 5, 16, PCRelFlagVal}, {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal}, {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal}, {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal}}; @@ -121,6 +122,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case AArch64::fixup_aarch64_movw: case AArch64::fixup_aarch64_pcrel_branch14: + case AArch64::fixup_aarch64_pcrel_branch16: case AArch64::fixup_aarch64_add_imm12: case AArch64::fixup_aarch64_ldst_imm12_scale1: case AArch64::fixup_aarch64_ldst_imm12_scale2: @@ -314,6 +316,17 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, if (Value & 0x3) Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3fff; + case AArch64::fixup_aarch64_pcrel_branch16: + // Unsigned PC-relative offset, so invert the negative immediate. + SignedValue = -SignedValue; + Value = static_cast<uint64_t>(SignedValue); + // Check valid 18-bit unsigned range. + if (SignedValue < 0 || SignedValue > ((1 << 18) - 1)) + Ctx.reportError(Fixup.getLoc(), "fixup value out of range"); + // Low two bits are not encoded (4-byte alignment assumed). 
+ if (Value & 0b11) + Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); + return (Value >> 2) & 0xffff; case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: if (TheTriple.isOSBinFormatCOFF() && !IsResolved && SignedValue != 0) { @@ -380,6 +393,7 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con case AArch64::fixup_aarch64_movw: case AArch64::fixup_aarch64_pcrel_branch14: + case AArch64::fixup_aarch64_pcrel_branch16: case AArch64::fixup_aarch64_add_imm12: case AArch64::fixup_aarch64_ldst_imm12_scale1: case AArch64::fixup_aarch64_ldst_imm12_scale2: diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 9de40661298cc..496ab18e9b195 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -186,6 +186,10 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, return R_CLS(LD_PREL_LO19); case AArch64::fixup_aarch64_pcrel_branch14: return R_CLS(TSTBR14); + case AArch64::fixup_aarch64_pcrel_branch16: + Ctx.reportError(Fixup.getLoc(), + "relocation of PAC/AUT instructions is not supported"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_pcrel_branch19: return R_CLS(CONDBR19); default: diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h index 767dd88055201..fdee2d5ad2bf3 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h @@ -43,6 +43,11 @@ enum Fixups { // The high 14 bits of a 21-bit pc-relative immediate. fixup_aarch64_pcrel_branch14, + // The high 16 bits of a 18-bit unsigned PC-relative immediate. Used by + // pointer authentication, only within a function, so no relocation can be + // generated. + fixup_aarch64_pcrel_branch16, + // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as // fixup_aarch64_pcrel_adrhi, except this is use by b.cc and generates // relocations directly when necessary. diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index dbc4323a860f5..c3e12b6d8024e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -88,6 +88,12 @@ class AArch64MCCodeEmitter : public MCCodeEmitter { SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + /// getPAuthPCRelOpValue - Return the encoded value for a pointer + /// authentication pc-relative operand. + uint32_t getPAuthPCRelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + /// getLoadLiteralOpValue - Return the encoded value for a load-literal /// pc-relative address. uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx, @@ -327,6 +333,29 @@ uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue( return 0; } +/// getPAuthPCRelOpValue - Return the encoded value for a pointer +/// authentication pc-relative operand. 
+uint32_t +AArch64MCCodeEmitter::getPAuthPCRelOpValue(const MCInst &MI, unsigned OpIdx, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + const MCOperand &MO = MI.getOperand(OpIdx); + + // If the destination is an immediate, invert sign as it's a negative value + // that should be encoded as unsigned + if (MO.isImm()) + return -(MO.getImm()); + assert(MO.isExpr() && "Unexpected target type!"); + + MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch16); + Fixups.push_back(MCFixup::create(0, MO.getExpr(), Kind, MI.getLoc())); + + ++MCNumFixups; + + // All of the information is in the fixup. + return 0; +} + /// getLoadLiteralOpValue - Return the encoded value for a load-literal /// pc-relative address. uint32_t diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s new file mode 100644 index 0000000000000..d06183be9da3e --- /dev/null +++ b/llvm/test/MC/AArch64/armv9.5a-pauthlr-diagnostics.s @@ -0,0 +1,57 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr 2>&1 < %s | FileCheck %s + + autiasppc #2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: autiasppc #2 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + autiasppc #1<<17 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: autiasppc #1<<17 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + autiasppc #-2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: autiasppc #-2 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + autiasppc w0 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: autiasppc w0 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + autiasppc sp +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: autiasppc sp +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retabsppc #2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retabsppc #2 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retabsppc #(1<<17) +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retabsppc #(1<<17) +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retabsppc #-2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retabsppc #-2 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retaasppc w0 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retaasppc w0 +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retaasppc sp +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retaasppc sp +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + retaasppc xzr +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: expected label or encodable integer pc offset +// CHECK-NEXT: retaasppc xzr +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s new file mode 100644 index 0000000000000..c10142a199766 --- /dev/null +++ b/llvm/test/MC/AArch64/armv9.5a-pauthlr-reloc.s @@ -0,0 +1,12 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr -filetype=obj -o /dev/null 2>&1 < %s | FileCheck %s + + autiasppc undef_label +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: relocation of PAC/AUT instructions is not supported +// CHECK-NEXT: 
autiasppc undef_label +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + + autibsppc undef_label +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: relocation of PAC/AUT instructions is not supported +// CHECK-NEXT: autibsppc undef_label +// CHECK-NOT: [[@LINE-3]]:{{[0-9]+}}: + diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr.s new file mode 100644 index 0000000000000..24e9c44984683 --- /dev/null +++ b/llvm/test/MC/AArch64/armv9.5a-pauthlr.s @@ -0,0 +1,151 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+pauth-lr < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+pauth-lr < %s \ +// RUN: | llvm-objdump -d --mattr=+pauth-lr - | FileCheck %s --check-prefix=CHECK-DISASS +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+pauth-lr < %s \ +// RUN: | llvm-objdump -d --mattr=-pauth-lr - | FileCheck %s --check-prefix=CHECK-UNKNOWN + +// Label at address 4, so we can test that the address shows up in the +// disassembly. + nop +label1: + + paciasppc +// CHECK-INST: paciasppc +// CHECK-DISASS: paciasppc +// CHECK-ENCODING: [0xfe,0xa3,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac1a3fe <unknown> + + pacibsppc +// CHECK-INST: pacibsppc +// CHECK-DISASS: pacibsppc +// CHECK-ENCODING: [0xfe,0xa7,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac1a7fe <unknown> + + pacnbiasppc +// CHECK-INST: pacnbiasppc +// CHECK-DISASS: pacnbiasppc +// CHECK-ENCODING: [0xfe,0x83,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac183fe <unknown> + + pacnbibsppc +// CHECK-INST: pacnbibsppc +// CHECK-DISASS: pacnbibsppc +// CHECK-ENCODING: [0xfe,0x87,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac187fe <unknown> + + autiasppc label1 +// CHECK-INST: autiasppc label1 +// CHECK-DISASS: autiasppc 0x4 <label1> +// CHECK-ENCODING: [0bAAA11111,A,0b100AAAAA,0xf3] +// CHECK-ENCODING: fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: f380009f <unknown> + + autibsppc label1 +// CHECK-INST: autibsppc label1 +// CHECK-DISASS: autibsppc 0x4 <label1> +// CHECK-ENCODING: [0bAAA11111,A,0b101AAAAA,0xf3] +// CHECK-ENCODING: fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: f3a000bf <unknown> + + autibsppc #0 +// CHECK-INST: autibsppc #0 +// CHECK-DISASS: autibsppc 0x1c <label1+0x18> +// CHECK-ENCODING: [0x1f,0x00,0xa0,0xf3] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: f3a0001f <unknown> + + autibsppc #-(1<<18)+4 +// CHECK-INST: autibsppc #-262140 +// CHECK-DISASS: autibsppc 0xfffffffffffc0024 <label1+0xfffffffffffc0020> +// CHECK-ENCODING: [0xff,0xff,0xbf,0xf3] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: f3bfffff <unknown> + + autiasppc x0 +// CHECK-INST: autiasppc x0 +// CHECK-DISASS: autiasppc x0 +// CHECK-ENCODING: [0x1e,0x90,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac1901e <unknown> + + autibsppc x1 +// CHECK-INST: autibsppc x1 +// CHECK-DISASS: autibsppc x1 +// CHECK-ENCODING: [0x3e,0x94,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac1943e <unknown> + + autiasppc xzr +// CHECK-INST: autiasppc 
xzr +// CHECK-DISASS: autiasppc xzr +// CHECK-ENCODING: [0xfe,0x93,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac193fe <unknown> + + autibsppc xzr +// CHECK-INST: autibsppc xzr +// CHECK-DISASS: autibsppc xzr +// CHECK-ENCODING: [0xfe,0x97,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac197fe <unknown> + + + retaasppc label1 +// CHECK-INST: retaasppc label1 +// CHECK-DISASS: retaasppc 0x4 <label1> +// CHECK-ENCODING: [0bAAA11111,A,0b000AAAAA,0x55] +// CHECK-ENCODING: // fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: 5500019f <unknown> + + retabsppc label1 +// CHECK-INST: retabsppc label1 +// CHECK-DISASS: retabsppc 0x4 <label1> +// CHECK-ENCODING: [0bAAA11111,A,0b001AAAAA,0x55] +// CHECK-ENCODING: // fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: 552001bf <unknown> + + retaasppc #0 +// CHECK-INST: retaasppc #0 +// CHECK-DISASS: retaasppc 0x3c <label1+0x38> +// CHECK-ENCODING: [0x1f,0x00,0x00,0x55] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: 5500001f <unknown> + + retaasppc #-(1<<18)+4 +// CHECK-INST: retaasppc #-262140 +// CHECK-DISASS: retaasppc 0xfffffffffffc0044 <label1+0xfffffffffffc0040> +// CHECK-ENCODING: [0xff,0xff,0x1f,0x55] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: 551fffff <unknown> + + retaasppc x2 +// CHECK-INST: retaasppc x2 +// CHECK-DISASS: retaasppc x2 +// CHECK-ENCODING: [0xe2,0x0b,0x5f,0xd6] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: d65f0be2 <unknown> + + retabsppc x3 +// CHECK-INST: retabsppc x3 +// CHECK-DISASS: retabsppc x3 +// CHECK-ENCODING: [0xe3,0x0f,0x5f,0xd6] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: d65f0fe3 <unknown> + + pacm +// CHECK-INST: pacm +// CHECK-DISASS: pacm +// CHECK-ENCODING: [0xff,0x24,0x03,0xd5] +// CHECK-ERROR-NOT: instruction requires: +// CHECK-UNKNOWN: d50324ff hint #39 diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt new file mode 100644 index 0000000000000..caf1fde2c2b7c --- /dev/null +++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt @@ -0,0 +1,78 @@ +# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+pauth-lr < %s | FileCheck %s +# RUN: not llvm-mc -triple aarch64 -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NO-PAUTHLR + +[0xfe,0xa3,0xc1,0xda] +# CHECK: paciasppc +# NO-PAUTHLR: invalid instruction encoding + +[0xfe,0xa7,0xc1,0xda] +# CHECK: pacibsppc +# NO-PAUTHLR: invalid instruction encoding + +[0xfe,0x83,0xc1,0xda] +# CHECK: pacnbiasppc +# NO-PAUTHLR: invalid instruction encoding + +[0xfe,0x87,0xc1,0xda] +# CHECK: pacnbibsppc +# NO-PAUTHLR: invalid instruction encoding + +[0x9f,0x00,0x80,0xf3] +# CHECK: autiasppc #-16 +# NO-PAUTHLR: invalid instruction encoding + +[0xbf,0x00,0xa0,0xf3] +# CHECK: autibsppc #-20 +# NO-PAUTHLR: invalid instruction encoding + +[0x1f,0x00,0xa0,0xf3] +# CHECK: autibsppc #0 +# NO-PAUTHLR: invalid instruction encoding + +[0xff,0xff,0xbf,0xf3] +# CHECK: autibsppc #-262140 +# NO-PAUTHLR: invalid instruction encoding + +[0x1e,0x90,0xc1,0xda] +# CHECK: autiasppc x0 +# NO-PAUTHLR: invalid instruction encoding + +[0x3e,0x94,0xc1,0xda] +# CHECK: autibsppc x1 +# NO-PAUTHLR: invalid instruction encoding + +[0xfe,0x93,0xc1,0xda] +# CHECK: autiasppc xzr +# NO-PAUTHLR: invalid 
instruction encoding + +[0xfe,0x97,0xc1,0xda] +# CHECK: autibsppc xzr +# NO-PAUTHLR: invalid instruction encoding + +[0xbf,0x01,0x00,0x55] +# CHECK: retaasppc #-52 +# NO-PAUTHLR: invalid instruction encoding + +[0xdf,0x01,0x20,0x55] +# CHECK: retabsppc #-56 +# NO-PAUTHLR: invalid instruction encoding + +[0x1f,0x00,0x00,0x55] +# CHECK: retaasppc #0 +# NO-PAUTHLR: invalid instruction encoding + +[0xff,0xff,0x1f,0x55] +# CHECK: retaasppc #-262140 +# NO-PAUTHLR: invalid instruction encoding + +[0xe2,0x0b,0x5f,0xd6] +# CHECK: retaasppc x2 +# NO-PAUTHLR: invalid instruction encoding + +[0xe3,0x0f,0x5f,0xd6] +# CHECK: retabsppc x3 +# NO-PAUTHLR: invalid instruction encoding + +[0xff,0x24,0x03,0xd5] +# CHECK: pacm +# NO-PAUTHLR: hint #39 From 6a870cca70e3df6070bdcd2768d6569daae8e1ba Mon Sep 17 00:00:00 2001 From: Walter Lee <49250218+googlewalt@users.noreply.github.com> Date: Thu, 21 Dec 2023 13:15:02 -0500 Subject: [PATCH 102/342] Add tests for driver to propagate module map flags for layering check (#75827) Xcode 14.3.1 seems to have dropped these flags so we are creating unit tests to reproduce the issue. --- clang/test/Driver/modules.m | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/clang/test/Driver/modules.m b/clang/test/Driver/modules.m index 9eb3569805569..d1a65f5cb0713 100644 --- a/clang/test/Driver/modules.m +++ b/clang/test/Driver/modules.m @@ -57,7 +57,21 @@ // CHECK-MODULE-MAP-FILES: "-fmodule-map-file=foo.map" // CHECK-MODULE-MAP-FILES: "-fmodule-map-file=bar.map" -// RUN: %clang -fmodules -fbuiltin-module-map -### %s 2>&1 | FileCheck -check-prefix=CHECK-BUILTIN-MODULE-MAP %s +// Verify that the driver propagates -fmodule-name and -fmodule-map-file flags when +// -fmodules-decluse or -fmodules-strict-decluse, as used for layering check. +// RUN: %clang -fmodules-decluse -fmodule-name=foo -c -### %s 2>&1 | FileCheck -check-prefix=CHECK-DECLUSE-PROPAGATE-MODULE-NAME %s +// CHECK-DECLUSE-PROPAGATE-MODULE-NAME: -fmodule-name=foo + +// RUN: %clang -fmodules-decluse -fmodule-map-file=foo.map -c -### %s 2>&1 | FileCheck -check-prefix=CHECK-DECLUSE-PROPAGATE-MODULE-MAPS %s +// CHECK-DECLUSE-PROPAGATE-MODULE-MAPS: -fmodule-map-file=foo.map + +// RUN: %clang -fmodules-strict-decluse -fmodule-name=foo -c -### %s 2>&1 | FileCheck -check-prefix=CHECK-STRICT-DECLUSE-PROPAGATE-MODULE-NAME %s +// CHECK-STRICT-DECLUSE-PROPAGATE-MODULE-NAME: -fmodule-name=foo + +// RUN: %clang -fmodules-strict-decluse -fmodule-map-file=foo.map -c -### %s 2>&1 | FileCheck -check-prefix=CHECK-STRICT-DECLUSE-PROPAGATE-MODULE-MAPS %s +// CHECK-STRICT-DECLUSE-PROPAGATE-MODULE-MAPS: -fmodule-map-file=foo.map + + // RUN: %clang -fmodules -fbuiltin-module-map -### %s 2>&1 | FileCheck -check-prefix=CHECK-BUILTIN-MODULE-MAP %s // CHECK-BUILTIN-MODULE-MAP: "-fmodules" // CHECK-BUILTIN-MODULE-MAP: "-fmodule-map-file={{.*}}include{{/|\\\\}}module.modulemap" From f44079db22036d0ade2cf3d2e5a24bde5d378efd Mon Sep 17 00:00:00 2001 From: David Li <57157229+david-xl@users.noreply.github.com> Date: Thu, 21 Dec 2023 10:18:57 -0800 Subject: [PATCH 103/342] [ISel] Add pattern matching for depositing subreg value (#75978) Depositing value into the lowest byte/word is a common code pattern. This patch improves the code generation for it to avoid redundant AND and OR operations. 
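For context, here is a minimal C++ sketch of the source-level pattern the new ISel patterns target (the function name is illustrative; the equivalent IR appears in the insert.ll tests added below):

    #include <cstdint>

    // Deposit a loaded byte into the low 8 bits of a wider value: clear the
    // low byte, then OR in the zero-extended load.
    uint64_t deposit_low_byte(uint64_t res, const uint8_t *byte) {
      return (res & ~0xffULL) | static_cast<uint64_t>(*byte);
    }

Per the CHECK lines in insert.ll, on x86-64 this now selects to a plain register move followed by a byte load into the low subregister (e.g. movb (%rsi), %al), rather than separate AND and OR instructions.
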
--- llvm/lib/Target/X86/X86InstrCompiler.td | 17 +++++ llvm/test/CodeGen/X86/insert.ll | 93 +++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 llvm/test/CodeGen/X86/insert.ll diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 457833f8cc331..c77c77ee4a3ee 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1515,6 +1515,23 @@ def : Pat<(X86add_flag_nocf GR32:$src1, 128), def : Pat<(X86add_flag_nocf GR64:$src1, 128), (SUB64ri32 GR64:$src1, -128)>; +// Depositing value to 8/16 bit subreg: +def : Pat<(or (and GR64:$dst, -256), + (i64 (zextloadi8 addr:$src))), + (INSERT_SUBREG (i64 (COPY $dst)), (MOV8rm i8mem:$src), sub_8bit)>; + +def : Pat<(or (and GR32:$dst, -256), + (i32 (zextloadi8 addr:$src))), + (INSERT_SUBREG (i32 (COPY $dst)), (MOV8rm i8mem:$src), sub_8bit)>; + +def : Pat<(or (and GR64:$dst, -65536), + (i64 (zextloadi16 addr:$src))), + (INSERT_SUBREG (i64 (COPY $dst)), (MOV16rm i16mem:$src), sub_16bit)>; + +def : Pat<(or (and GR32:$dst, -65536), + (i32 (zextloadi16 addr:$src))), + (INSERT_SUBREG (i32 (COPY $dst)), (MOV16rm i16mem:$src), sub_16bit)>; + // The same trick applies for 32-bit immediate fields in 64-bit // instructions. def : Pat<(add GR64:$src1, 0x0000000080000000), diff --git a/llvm/test/CodeGen/X86/insert.ll b/llvm/test/CodeGen/X86/insert.ll new file mode 100644 index 0000000000000..381de2ecaa164 --- /dev/null +++ b/llvm/test/CodeGen/X86/insert.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64 + +define i64 @sub8(i64 noundef %res, ptr %byte) { +; X86-LABEL: sub8: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movb (%ecx), %al +; X86-NEXT: retl +; +; X64-LABEL: sub8: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movb (%rsi), %al +; X64-NEXT: retq +entry: + %and = and i64 %res, -256 + %d = load i8, ptr %byte, align 1 + %conv2 = zext i8 %d to i64 + %or = or i64 %and, %conv2 + ret i64 %or +} + +define i64 @sub16(i64 noundef %res, ptr %byte) { +; X86-LABEL: sub16: +; X86: # %bb.0: # %entry +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $16, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: sub16: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movw (%rsi), %ax +; X64-NEXT: retq +entry: + %and = and i64 %res, -65536 + %d = load i16, ptr %byte, align 1 + %conv2 = zext i16 %d to i64 + %or = or i64 %and, %conv2 + ret i64 %or +} + +define i32 @sub8_32(i32 noundef %res, ptr %byte) { +; X86-LABEL: sub8_32: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movb (%ecx), %al +; X86-NEXT: retl +; +; X64-LABEL: sub8_32: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movb (%rsi), %al +; X64-NEXT: retq +entry: + %and = and i32 %res, -256 + %d = load i8, ptr %byte, align 1 + %conv2 = zext i8 %d to i32 + %or = or i32 %and, %conv2 + ret i32 %or +} + +define i32 @sub16_32(i32 noundef %res, ptr %byte) { +; X86-LABEL: sub16_32: +; X86: # %bb.0: # 
%entry +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $16, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl (%eax), %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: sub16_32: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %edi, %eax +; X64-NEXT: movw (%rsi), %ax +; X64-NEXT: retq +entry: + %and = and i32 %res, -65536 + %d = load i16, ptr %byte, align 1 + %conv2 = zext i16 %d to i32 + %or = or i32 %and, %conv2 + ret i32 %or +} From c50de57feb2b824d789fe3bc4e0d24c5bfc266ea Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Thu, 21 Dec 2023 10:30:36 -0800 Subject: [PATCH 104/342] [flang] Fix a warning This patch fixes: flang/lib/Optimizer/Transforms/StackArrays.cpp:452:7: error: ignoring return value of function declared with 'nodiscard' attribute [-Werror,-Wunused-result] --- flang/lib/Optimizer/Transforms/StackArrays.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp index b51e2aae1a9d5..1c213abefe6f5 100644 --- a/flang/lib/Optimizer/Transforms/StackArrays.cpp +++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp @@ -449,7 +449,7 @@ StackArraysAnalysisWrapper::analyseFunction(mlir::Operation *func) { const LatticePoint *lattice = solver.lookupState<LatticePoint>(op); // there will be no lattice for an unreachable block if (lattice) - point.join(*lattice); + (void)point.join(*lattice); }; func->walk([&](mlir::func::ReturnOp child) { joinOperationLattice(child); }); func->walk([&](fir::UnreachableOp child) { joinOperationLattice(child); }); From 7bd17212ef23a72ea224a037126d33d3e02553fe Mon Sep 17 00:00:00 2001 From: Tomas Matheson <tomas.matheson@arm.com> Date: Thu, 21 Dec 2023 18:16:30 +0000 Subject: [PATCH 105/342] Re-land "[AArch64] Codegen support for FEAT_PAuthLR" (#75947) This reverts commit 9f0f5587426a4ff24b240018cf8bf3acc3c566ae. Fix expensive checks failure by properly marking register def for ADR. 
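To clarify what "marking register def" refers to (this is an excerpt-style illustration mirroring BuildPACM in the AArch64PointerAuth.cpp hunk below, not standalone code): the ADR that materialises the address of the PACI*SP signing instruction writes X16, so that operand must be added with RegState::Define; otherwise the machine verifier enabled by expensive checks rejects the instruction.

    // Mirrors the diff below: X16 is a definition of the ADR, not a use.
    BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADR))
        .addReg(AArch64::X16, RegState::Define)
        .addSym(PACSym);
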
--- clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Basic/TargetInfo.h | 1 + clang/include/clang/Driver/Options.td | 2 + clang/lib/Basic/Targets/AArch64.cpp | 1 + clang/lib/Basic/Targets/ARM.cpp | 1 + clang/lib/CodeGen/CodeGenModule.cpp | 3 + clang/lib/CodeGen/Targets/AArch64.cpp | 2 + clang/lib/Driver/ToolChains/Clang.cpp | 7 +- .../CodeGen/aarch64-branch-protection-attr.c | 28 + clang/test/Driver/aarch64-pauth-lr.c | 23 + clang/test/Driver/aarch64-v95a.c | 7 + .../llvm/TargetParser/AArch64TargetParser.h | 2 + .../llvm/TargetParser/ARMTargetParserCommon.h | 1 + llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 11 + .../AArch64/AArch64MachineFunctionInfo.cpp | 28 +- .../AArch64/AArch64MachineFunctionInfo.h | 18 + .../lib/Target/AArch64/AArch64PointerAuth.cpp | 86 ++- .../TargetParser/ARMTargetParserCommon.cpp | 6 +- .../AArch64/sign-return-address-pauth-lr.ll | 542 ++++++++++++++++++ .../CodeGen/AArch64/sign-return-address.ll | 3 + .../TargetParser/TargetParserTest.cpp | 4 +- 21 files changed, 752 insertions(+), 25 deletions(-) create mode 100644 clang/test/Driver/aarch64-pauth-lr.c create mode 100644 llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 152d9f65f86db..21abc346cf17a 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -456,6 +456,7 @@ ENUM_LANGOPT(SignReturnAddressScope, SignReturnAddressScopeKind, 2, SignReturnAd ENUM_LANGOPT(SignReturnAddressKey, SignReturnAddressKeyKind, 1, SignReturnAddressKeyKind::AKey, "Key used for return address signing") LANGOPT(BranchTargetEnforcement, 1, 0, "Branch-target enforcement enabled") +LANGOPT(BranchProtectionPAuthLR, 1, 0, "Use PC as a diversifier using PAuthLR NOP instructions.") LANGOPT(SpeculativeLoadHardening, 1, 0, "Speculative load hardening enabled") diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index aa0f5023104a1..ac3c324c6c29c 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1372,6 +1372,7 @@ class TargetInfo : public TransferrableTargetInfo, LangOptions::SignReturnAddressKeyKind SignKey = LangOptions::SignReturnAddressKeyKind::AKey; bool BranchTargetEnforcement = false; + bool BranchProtectionPAuthLR = false; }; /// Determine if the Architecture in this TargetInfo supports branch diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 9678165bfd98e..2b93ddf033499 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -7000,6 +7000,8 @@ def msign_return_address_key_EQ : Joined<["-"], "msign-return-address-key=">, Values<"a_key,b_key">; def mbranch_target_enforce : Flag<["-"], "mbranch-target-enforce">, MarshallingInfoFlag<LangOpts<"BranchTargetEnforcement">>; +def mbranch_protection_pauth_lr : Flag<["-"], "mbranch-protection-pauth-lr">, + MarshallingInfoFlag<LangOpts<"BranchProtectionPAuthLR">>; def fno_dllexport_inlines : Flag<["-"], "fno-dllexport-inlines">, MarshallingInfoNegativeFlag<LangOpts<"DllExportInlines">>; def cfguard_no_checks : Flag<["-"], "cfguard-no-checks">, diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index def16c032c869..3ee39133fcee7 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -225,6 +225,7 @@ bool AArch64TargetInfo::validateBranchProtection(StringRef Spec, 
StringRef, BPI.SignKey = LangOptions::SignReturnAddressKeyKind::BKey; BPI.BranchTargetEnforcement = PBP.BranchTargetEnforcement; + BPI.BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR; return true; } diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index ce7e4d4639cea..6e1842fc64e50 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -419,6 +419,7 @@ bool ARMTargetInfo::validateBranchProtection(StringRef Spec, StringRef Arch, BPI.SignKey = LangOptions::SignReturnAddressKeyKind::AKey; BPI.BranchTargetEnforcement = PBP.BranchTargetEnforcement; + BPI.BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR; return true; } diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index b2e173d0d6949..d78f2594a2376 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1106,6 +1106,9 @@ void CodeGenModule::Release() { if (LangOpts.BranchTargetEnforcement) getModule().addModuleFlag(llvm::Module::Min, "branch-target-enforcement", 1); + if (LangOpts.BranchProtectionPAuthLR) + getModule().addModuleFlag(llvm::Module::Min, "branch-protection-pauth-lr", + 1); if (LangOpts.hasSignReturnAddress()) getModule().addModuleFlag(llvm::Module::Min, "sign-return-address", 1); if (LangOpts.isSignReturnAddressScopeAll()) diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index be5145daa00b7..7102d190fe008 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -136,6 +136,8 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo { Fn->addFnAttr("branch-target-enforcement", BPI.BranchTargetEnforcement ? "true" : "false"); + Fn->addFnAttr("branch-protection-pauth-lr", + BPI.BranchProtectionPAuthLR ? 
"true" : "false"); } bool isScalarizableAsmOperand(CodeGen::CodeGenFunction &CGF, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index de9fd5eaa1e02..4783affd3220b 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1497,7 +1497,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, << Triple.getArchName(); StringRef Scope, Key; - bool IndirectBranches; + bool IndirectBranches, BranchProtectionPAuthLR; if (A->getOption().matches(options::OPT_msign_return_address_EQ)) { Scope = A->getValue(); @@ -1506,6 +1506,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, << A->getSpelling() << Scope; Key = "a_key"; IndirectBranches = false; + BranchProtectionPAuthLR = false; } else { StringRef DiagMsg; llvm::ARM::ParsedBranchProtection PBP; @@ -1517,6 +1518,7 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, << "b-key" << A->getAsString(Args); Scope = PBP.Scope; Key = PBP.Key; + BranchProtectionPAuthLR = PBP.BranchProtectionPAuthLR; IndirectBranches = PBP.BranchTargetEnforcement; } @@ -1525,6 +1527,9 @@ static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, if (!Scope.equals("none")) CmdArgs.push_back( Args.MakeArgString(Twine("-msign-return-address-key=") + Key)); + if (BranchProtectionPAuthLR) + CmdArgs.push_back( + Args.MakeArgString(Twine("-mbranch-protection-pauth-lr"))); if (IndirectBranches) CmdArgs.push_back("-mbranch-target-enforce"); } diff --git a/clang/test/CodeGen/aarch64-branch-protection-attr.c b/clang/test/CodeGen/aarch64-branch-protection-attr.c index 3c2714e2feda2..8ab3e17ade426 100644 --- a/clang/test/CodeGen/aarch64-branch-protection-attr.c +++ b/clang/test/CodeGen/aarch64-branch-protection-attr.c @@ -46,6 +46,24 @@ __attribute__ ((target("branch-protection=pac-ret+leaf+bti"))) void btileaf() {} // CHECK: define{{.*}} void @btileaf() #[[#BTIPACLEAF:]] + +__attribute__ ((target("branch-protection=pac-ret+pc"))) +void pauthlr() {} +// CHECK: define{{.*}} void @pauthlr() #[[#PAUTHLR:]] + +__attribute__ ((target("branch-protection=pac-ret+pc+b-key"))) +void pauthlr_bkey() {} +// CHECK: define{{.*}} void @pauthlr_bkey() #[[#PAUTHLR_BKEY:]] + +__attribute__ ((target("branch-protection=pac-ret+pc+leaf"))) +void pauthlr_leaf() {} +// CHECK: define{{.*}} void @pauthlr_leaf() #[[#PAUTHLR_LEAF:]] + +__attribute__ ((target("branch-protection=pac-ret+pc+bti"))) +void pauthlr_bti() {} +// CHECK: define{{.*}} void @pauthlr_bti() #[[#PAUTHLR_BTI:]] + + // CHECK-DAG: attributes #[[#NONE]] = { {{.*}} "branch-target-enforcement"="false" {{.*}} "sign-return-address"="none" // CHECK-DAG: attributes #[[#STD]] = { {{.*}} "branch-target-enforcement"="true" {{.*}} "sign-return-address"="non-leaf" "sign-return-address-key"="a_key" @@ -61,3 +79,13 @@ void btileaf() {} // CHECK-DAG: attributes #[[#PACBKEYLEAF]] = { {{.*}} "branch-target-enforcement"="false" {{.*}}"sign-return-address"="all" "sign-return-address-key"="b_key" // CHECK-DAG: attributes #[[#BTIPACLEAF]] = { {{.*}}"branch-target-enforcement"="true" {{.*}} "sign-return-address"="all" "sign-return-address-key"="a_key" + + +// CHECK-DAG: attributes #[[#PAUTHLR]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="a_key" + +// CHECK-DAG: attributes #[[#PAUTHLR_BKEY]] = { {{.*}}"branch-protection-pauth-lr"="true" 
{{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="b_key" + +// CHECK-DAG: attributes #[[#PAUTHLR_LEAF]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="false" {{.*}}"sign-return-address"="all" "sign-return-address-key"="a_key" + +// CHECK-DAG: attributes #[[#PAUTHLR_BTI]] = { {{.*}}"branch-protection-pauth-lr"="true" {{.*}}"branch-target-enforcement"="true" {{.*}}"sign-return-address"="non-leaf" "sign-return-address-key"="a_key" + diff --git a/clang/test/Driver/aarch64-pauth-lr.c b/clang/test/Driver/aarch64-pauth-lr.c new file mode 100644 index 0000000000000..2e1b530fc9895 --- /dev/null +++ b/clang/test/Driver/aarch64-pauth-lr.c @@ -0,0 +1,23 @@ +// Check the -cc1 flags for the various forms of -mbranch-protection=pac-ret+pc. + +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+b-key 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+b-key -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-B-KEY +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-BTI +// RUN: %clang -target aarch64-arm-none-eabi -c %s -### -mbranch-protection=pac-ret+pc+leaf+b-key+bti -march=armv9.5-a 2>&1 | FileCheck %s --check-prefixes=PAUTH-LR-LEAF-B-KEY-BTI + +// PAUTH-LR: "-msign-return-address=non-leaf" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr" +// PAUTH-LR-B-KEY: "-msign-return-address=non-leaf" "-msign-return-address-key=b_key" "-mbranch-protection-pauth-lr" +// PAUTH-LR-LEAF: "-msign-return-address=all" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr" +// PAUTH-LR-BTI: "-msign-return-address=non-leaf" "-msign-return-address-key=a_key" "-mbranch-protection-pauth-lr" +// PAUTH-LR-LEAF-B-KEY-BTI: "-msign-return-address=all" "-msign-return-address-key=b_key" "-mbranch-protection-pauth-lr" "-mbranch-target-enforce" + +// NOT-PAUTH-LR: "-mbranch-target-enforce" +// NOT-PAUTH-LR-B-KEY: "-mbranch-target-enforce" +// NOT-PAUTH-LR-LEAF: "-mbranch-target-enforce" +// NOT-PAUTH-LR-BTI: "-mbranch-target-enforce" diff --git a/clang/test/Driver/aarch64-v95a.c b/clang/test/Driver/aarch64-v95a.c index 366cade86a9fb..6fac62e8b389a 100644 --- a/clang/test/Driver/aarch64-v95a.c +++ b/clang/test/Driver/aarch64-v95a.c @@ -1,3 +1,5 @@ +// ===== Base v9.5a architecture ===== + // RUN: %clang -target aarch64 -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // RUN: %clang 
-target aarch64 -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // RUN: %clang -target aarch64 -mlittle-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s @@ -5,6 +7,7 @@ // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A %s // GENERICV95A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" + // RUN: %clang -target aarch64_be -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s // RUN: %clang -target aarch64_be -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s // RUN: %clang -target aarch64 -mbig-endian -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV95A-BE %s @@ -18,3 +21,7 @@ // RUN: %clang -target aarch64 -march=armv9.5a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s // RUN: %clang -target aarch64 -march=armv9.5-a+cpa -### -c %s 2>&1 | FileCheck -check-prefix=V95A-CPA %s // V95A-CPA: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" "-target-feature" "+cpa" + +// RUN: %clang -target aarch64 -march=armv9.5a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s +// RUN: %clang -target aarch64 -march=armv9.5-a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s +// V95A-PAUTHLR: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" "-target-feature" "+pauth-lr" diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index f0b35790133fb..6c7410a8b8f79 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -174,6 +174,7 @@ enum ArchExtKind : unsigned { AEK_SMEF8F32 = 70, // FEAT_SME_F8F32 AEK_SMEFA64 = 71, // FEAT_SME_FA64 AEK_CPA = 72, // FEAT_CPA + AEK_PAUTHLR = 73, // FEAT_PAuth_LR AEK_NUM_EXTENSIONS }; using ExtensionBitset = Bitset<AEK_NUM_EXTENSIONS>; @@ -297,6 +298,7 @@ inline constexpr ExtensionInfo Extensions[] = { {"sme-f8f32", AArch64::AEK_SMEF8F32, "+sme-f8f32", "-sme-f8f32", FEAT_INIT, "+sme2,+fp8", 0}, {"sme-fa64", AArch64::AEK_SMEFA64, "+sme-fa64", "-sme-fa64", FEAT_INIT, "", 0}, {"cpa", AArch64::AEK_CPA, "+cpa", "-cpa", FEAT_INIT, "", 0}, + {"pauth-lr", AArch64::AEK_PAUTHLR, "+pauth-lr", "-pauth-lr", FEAT_INIT, "", 0}, // Special cases {"none", AArch64::AEK_NONE, {}, {}, FEAT_INIT, "", ExtensionInfo::MaxFMVPriority}, }; diff --git a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h index e3d9ffc1d4db5..1e4187c6fb111 100644 --- a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h +++ b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h @@ -41,6 +41,7 @@ struct ParsedBranchProtection { StringRef Scope; StringRef Key; bool BranchTargetEnforcement; + bool BranchProtectionPAuthLR; }; bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 175f6ef49c3ba..6d85e1fb5fbf1 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -8802,12 +8802,23 @@ 
AArch64InstrInfo::getOutliningTypeImpl(MachineBasicBlock::iterator &MIT, // Don't outline anything used for return address signing. The outlined // function will get signed later if needed switch (MI.getOpcode()) { + case AArch64::PACM: case AArch64::PACIASP: case AArch64::PACIBSP: + case AArch64::PACIASPPC: + case AArch64::PACIBSPPC: case AArch64::AUTIASP: case AArch64::AUTIBSP: + case AArch64::AUTIASPPCi: + case AArch64::AUTIASPPCr: + case AArch64::AUTIBSPPCi: + case AArch64::AUTIBSPPCr: case AArch64::RETAA: case AArch64::RETAB: + case AArch64::RETAASPPCi: + case AArch64::RETAASPPCr: + case AArch64::RETABSPPCi: + case AArch64::RETABSPPCr: case AArch64::EMITBKEY: case AArch64::PAUTH_PROLOGUE: case AArch64::PAUTH_EPILOGUE: diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp index 9da59ef2a8062..1a8c71888a852 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp @@ -93,16 +93,24 @@ AArch64FunctionInfo::AArch64FunctionInfo(const Function &F, // TODO: skip functions that have no instrumented allocas for optimization IsMTETagged = F.hasFnAttribute(Attribute::SanitizeMemTag); - if (!F.hasFnAttribute("branch-target-enforcement")) { - if (const auto *BTE = mdconst::extract_or_null<ConstantInt>( - F.getParent()->getModuleFlag("branch-target-enforcement"))) - BranchTargetEnforcement = BTE->getZExtValue(); - } else { - const StringRef BTIEnable = - F.getFnAttribute("branch-target-enforcement").getValueAsString(); - assert(BTIEnable == "true" || BTIEnable == "false"); - BranchTargetEnforcement = BTIEnable == "true"; - } + // BTI/PAuthLR may be set either on the function or the module. Set Bool from + // either the function attribute or module attribute, depending on what is + // set. + // Note: the module attributed is numeric (0 or 1) but the function attribute + // is stringy ("true" or "false"). + auto TryFnThenModule = [&](StringRef AttrName, bool &Bool) { + if (F.hasFnAttribute(AttrName)) { + const StringRef V = F.getFnAttribute(AttrName).getValueAsString(); + assert(V.equals_insensitive("true") || V.equals_insensitive("false")); + Bool = V.equals_insensitive("true"); + } else if (const auto *ModVal = mdconst::extract_or_null<ConstantInt>( + F.getParent()->getModuleFlag(AttrName))) { + Bool = ModVal->getZExtValue(); + } + }; + + TryFnThenModule("branch-target-enforcement", BranchTargetEnforcement); + TryFnThenModule("branch-protection-pauth-lr", BranchProtectionPAuthLR); // The default stack probe size is 4096 if the function has no // stack-probe-size attribute. This is a safe default because it is the diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 219f83cfd32e0..cd4a18bfbc23a 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -22,6 +22,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCLinkerOptimizationHint.h" +#include "llvm/MC/MCSymbol.h" #include <cassert> #include <optional> @@ -164,10 +165,21 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// SignWithBKey modifies the default PAC-RET mode to signing with the B key. 
bool SignWithBKey = false; + /// SigningInstrOffset captures the offset of the PAC-RET signing instruction + /// within the prologue, so it can be re-used for authentication in the + /// epilogue when using PC as a second salt (FEAT_PAuth_LR) + MCSymbol *SignInstrLabel = nullptr; + /// BranchTargetEnforcement enables placing BTI instructions at potential /// indirect branch destinations. bool BranchTargetEnforcement = false; + /// Indicates that SP signing should be diversified with PC as-per PAuthLR. + /// This is set by -mbranch-protection and will emit NOP instructions unless + /// the subtarget feature +pauthlr is also used (in which case non-NOP + /// instructions are emitted). + bool BranchProtectionPAuthLR = false; + /// Whether this function has an extended frame record [Ctx, FP, LR]. If so, /// bit 60 of the in-memory FP will be 1 to enable other tools to detect the /// extended record. @@ -436,10 +448,16 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) const; bool shouldSignWithBKey() const { return SignWithBKey; } + + MCSymbol *getSigningInstrLabel() const { return SignInstrLabel; } + void setSigningInstrLabel(MCSymbol *Label) { SignInstrLabel = Label; } + bool isMTETagged() const { return IsMTETagged; } bool branchTargetEnforcement() const { return BranchTargetEnforcement; } + bool branchProtectionPAuthLR() const { return BranchProtectionPAuthLR; } + void setHasSwiftAsyncContext(bool HasContext) { HasSwiftAsyncContext = HasContext; } diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp index 7576d2a899d1a..7509afaeb5fef 100644 --- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp +++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp @@ -60,11 +60,35 @@ FunctionPass *llvm::createAArch64PointerAuthPass() { char AArch64PointerAuth::ID = 0; +// Where PAuthLR support is not known at compile time, it is supported using +// PACM. PACM is in the hint space so has no effect when PAuthLR is not +// supported by the hardware, but will alter the behaviour of PACI*SP, AUTI*SP +// and RETAA/RETAB if the hardware supports PAuthLR. +static void BuildPACM(const AArch64Subtarget &Subtarget, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + MachineInstr::MIFlag Flags, MCSymbol *PACSym = nullptr) { + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + auto &MFnI = *MBB.getParent()->getInfo<AArch64FunctionInfo>(); + + // ADR X16,<address_of_PACIASP> + if (PACSym) { + assert(Flags == MachineInstr::FrameDestroy); + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADR)) + .addReg(AArch64::X16, RegState::Define) + .addSym(PACSym); + } + + // Only emit PACM if -mbranch-protection has +pc and the target does not + // have feature +pauth-lr. 
+ if (MFnI.branchProtectionPAuthLR() && !Subtarget.hasPAuthLR()) + BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACM)).setMIFlag(Flags); +} + void AArch64PointerAuth::signLR(MachineFunction &MF, MachineBasicBlock::iterator MBBI) const { - const AArch64FunctionInfo *MFnI = MF.getInfo<AArch64FunctionInfo>(); - bool UseBKey = MFnI->shouldSignWithBKey(); - bool EmitCFI = MFnI->needsDwarfUnwindInfo(MF); + auto &MFnI = *MF.getInfo<AArch64FunctionInfo>(); + bool UseBKey = MFnI.shouldSignWithBKey(); + bool EmitCFI = MFnI.needsDwarfUnwindInfo(MF); bool NeedsWinCFI = MF.hasWinCFI(); MachineBasicBlock &MBB = *MBBI->getParent(); @@ -77,11 +101,29 @@ void AArch64PointerAuth::signLR(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } + // PAuthLR authentication instructions need to know the value of PC at the + // point of signing (PACI*). + if (MFnI.branchProtectionPAuthLR()) { + MCSymbol *PACSym = MF.getMMI().getContext().createTempSymbol(); + MFnI.setSigningInstrLabel(PACSym); + } + // No SEH opcode for this one; it doesn't materialize into an // instruction on Windows. - BuildMI(MBB, MBBI, DL, - TII->get(UseBKey ? AArch64::PACIBSP : AArch64::PACIASP)) - .setMIFlag(MachineInstr::FrameSetup); + if (MFnI.branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) { + BuildMI(MBB, MBBI, DL, + TII->get(MFnI.shouldSignWithBKey() ? AArch64::PACIBSPPC + : AArch64::PACIASPPC)) + .setMIFlag(MachineInstr::FrameSetup) + ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel()); + } else { + BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, + TII->get(MFnI.shouldSignWithBKey() ? AArch64::PACIBSP + : AArch64::PACIASP)) + .setMIFlag(MachineInstr::FrameSetup) + ->setPreInstrSymbol(MF, MFnI.getSigningInstrLabel()); + } if (EmitCFI) { unsigned CFIIndex = @@ -118,15 +160,37 @@ void AArch64PointerAuth::authenticateLR( // DW_CFA_AARCH64_negate_ra_state can't be emitted. bool TerminatorIsCombinable = TI != MBB.end() && TI->getOpcode() == AArch64::RET; + MCSymbol *PACSym = MFnI->getSigningInstrLabel(); + if (Subtarget->hasPAuth() && TerminatorIsCombinable && !NeedsWinCFI && !MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)) { - unsigned CombinedRetOpcode = UseBKey ? AArch64::RETAB : AArch64::RETAA; - BuildMI(MBB, TI, DL, TII->get(CombinedRetOpcode)).copyImplicitOps(*TI); + if (MFnI->branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) { + assert(PACSym && "No PAC instruction to refer to"); + BuildMI(MBB, TI, DL, + TII->get(UseBKey ? AArch64::RETABSPPCi : AArch64::RETAASPPCi)) + .addSym(PACSym) + .copyImplicitOps(*MBBI) + .setMIFlag(MachineInstr::FrameDestroy); + } else { + BuildPACM(*Subtarget, MBB, TI, DL, MachineInstr::FrameDestroy, PACSym); + BuildMI(MBB, TI, DL, TII->get(UseBKey ? AArch64::RETAB : AArch64::RETAA)) + .copyImplicitOps(*MBBI) + .setMIFlag(MachineInstr::FrameDestroy); + } MBB.erase(TI); } else { - unsigned AutOpcode = UseBKey ? AArch64::AUTIBSP : AArch64::AUTIASP; - BuildMI(MBB, MBBI, DL, TII->get(AutOpcode)) - .setMIFlag(MachineInstr::FrameDestroy); + if (MFnI->branchProtectionPAuthLR() && Subtarget->hasPAuthLR()) { + assert(PACSym && "No PAC instruction to refer to"); + BuildMI(MBB, MBBI, DL, + TII->get(UseBKey ? AArch64::AUTIBSPPCi : AArch64::AUTIASPPCi)) + .addSym(PACSym) + .setMIFlag(MachineInstr::FrameDestroy); + } else { + BuildPACM(*Subtarget, MBB, MBBI, DL, MachineInstr::FrameDestroy, PACSym); + BuildMI(MBB, MBBI, DL, + TII->get(UseBKey ? 
AArch64::AUTIBSP : AArch64::AUTIASP)) + .setMIFlag(MachineInstr::FrameDestroy); + } if (EmitAsyncCFI) { unsigned CFIIndex = diff --git a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp index 10b80cad43472..6d3a59d532fd3 100644 --- a/llvm/lib/TargetParser/ARMTargetParserCommon.cpp +++ b/llvm/lib/TargetParser/ARMTargetParserCommon.cpp @@ -134,13 +134,13 @@ ARM::EndianKind ARM::parseArchEndian(StringRef Arch) { } // Parse a branch protection specification, which has the form -// standard | none | [bti,pac-ret[+b-key,+leaf]*] +// standard | none | [bti,pac-ret[+b-key,+leaf,+pc]*] // Returns true on success, with individual elements of the specification // returned in `PBP`. Returns false in error, with `Err` containing // an erroneous part of the spec. bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, StringRef &Err) { - PBP = {"none", "a_key", false}; + PBP = {"none", "a_key", false, false}; if (Spec == "none") return true; // defaults are ok @@ -166,6 +166,8 @@ bool ARM::parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, PBP.Scope = "all"; else if (PACOpt == "b-key") PBP.Key = "b_key"; + else if (PACOpt == "pc") + PBP.BranchProtectionPAuthLR = true; else break; } diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll new file mode 100644 index 0000000000000..a78fa853d99dc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll @@ -0,0 +1,542 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 + +; PauthLR is controlled via a combination of -mbranch-protection and +pauth-lr. +; -mbranch-protection=+pc enables branch protection. If the feature +pauth-lr +; is available (v9.5a onwards) then non-NOP instructions are used; otherwise +; NOP instructions are used. + +; There are 6 cases to cover: + +; feature \ -mbranch-protection= | none | pac-ret | pac-ret+pc +; ------------------------------------------------------------------------ +; without +pauth-lr | no codegen | old pac | NOP pauth-lr +; with +pauth-lr | no codegen | old pac | non-NOP pauth-lr + +; sign-return-address.ll tests combinations of -mbranch-protection=none/pac-ret +; and whether +pauth-lr is present or not. 
+ +; sign-return-address-pauth-lr.ll is identical, with the addition of this module +; attribute, which enables -mbranch-protection=pac-ret+pc, and therefore tests +; the remaining parameter combinations in the table: +!llvm.module.flags = !{!1} +!1 = !{i32 1, !"branch-protection-pauth-lr", i32 1} + +; RUN: llc -mtriple=aarch64 < %s | FileCheck --check-prefixes=CHECK,COMPAT %s +; RUN: llc -mtriple=aarch64 -mattr=v8.3a < %s | FileCheck --check-prefixes=CHECK,V83A %s +; RUN: llc -mtriple=aarch64 -mattr=v9a -mattr=pauth-lr < %s | FileCheck --check-prefixes=PAUTHLR %s + +define i32 @leaf(i32 %x) { +; CHECK-LABEL: leaf: +; CHECK: // %bb.0: +; CHECK-NEXT: ret +; +; PAUTHLR-LABEL: leaf: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: ret + ret i32 %x +} + +define i32 @leaf_sign_none(i32 %x) "sign-return-address"="none" { +; CHECK-LABEL: leaf_sign_none: +; CHECK: // %bb.0: +; CHECK-NEXT: ret +; +; PAUTHLR-LABEL: leaf_sign_none: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: ret + ret i32 %x +} + +define i32 @leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { +; CHECK-LABEL: leaf_sign_non_leaf: +; CHECK: // %bb.0: +; CHECK-NEXT: ret +; +; PAUTHLR-LABEL: leaf_sign_non_leaf: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: ret + ret i32 %x +} + +define i32 @leaf_sign_all(i32 %x) "sign-return-address"="all" { +; COMPAT-LABEL: leaf_sign_all: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp0: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: adr x16, .Ltmp0 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_sign_all: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp0: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: adr x16, .Ltmp0 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: leaf_sign_all: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp0: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retaasppc .Ltmp0 + ret i32 %x +} + +define i64 @leaf_clobbers_lr(i64 %x) "sign-return-address"="non-leaf" { +; COMPAT-LABEL: leaf_clobbers_lr: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp1: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; COMPAT-NEXT: .cfi_def_cfa_offset 16 +; COMPAT-NEXT: .cfi_offset w30, -16 +; COMPAT-NEXT: //APP +; COMPAT-NEXT: mov x30, x0 +; COMPAT-NEXT: //NO_APP +; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; COMPAT-NEXT: adr x16, .Ltmp1 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_clobbers_lr: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp1: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; V83A-NEXT: .cfi_def_cfa_offset 16 +; V83A-NEXT: .cfi_offset w30, -16 +; V83A-NEXT: //APP +; V83A-NEXT: mov x30, x0 +; V83A-NEXT: //NO_APP +; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; V83A-NEXT: adr x16, .Ltmp1 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: leaf_clobbers_lr: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp1: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 +; PAUTHLR-NEXT: .cfi_offset w30, -16 +; PAUTHLR-NEXT: //APP +; PAUTHLR-NEXT: mov x30, x0 +; PAUTHLR-NEXT: //NO_APP +; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; PAUTHLR-NEXT: retaasppc .Ltmp1 + call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1 + ret i64 %x +} + +declare i32 @foo(i32) + +define i32 @non_leaf_sign_all(i32 %x) "sign-return-address"="all" { +; COMPAT-LABEL: non_leaf_sign_all: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp2: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; COMPAT-NEXT: .cfi_def_cfa_offset 16 +; COMPAT-NEXT: .cfi_offset w30, -16 +; COMPAT-NEXT: bl foo +; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; COMPAT-NEXT: adr x16, .Ltmp2 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: non_leaf_sign_all: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp2: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; V83A-NEXT: .cfi_def_cfa_offset 16 +; V83A-NEXT: .cfi_offset w30, -16 +; V83A-NEXT: bl foo +; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; V83A-NEXT: adr x16, .Ltmp2 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: non_leaf_sign_all: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp2: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 +; PAUTHLR-NEXT: .cfi_offset w30, -16 +; PAUTHLR-NEXT: bl foo +; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; PAUTHLR-NEXT: retaasppc .Ltmp2 + %call = call i32 @foo(i32 %x) + ret i32 %call +} + +define i32 @non_leaf_sign_non_leaf(i32 %x) "sign-return-address"="non-leaf" { +; COMPAT-LABEL: non_leaf_sign_non_leaf: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp3: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; COMPAT-NEXT: .cfi_def_cfa_offset 16 +; COMPAT-NEXT: .cfi_offset w30, -16 +; COMPAT-NEXT: bl foo +; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; COMPAT-NEXT: adr x16, .Ltmp3 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: non_leaf_sign_non_leaf: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp3: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; V83A-NEXT: .cfi_def_cfa_offset 16 +; V83A-NEXT: .cfi_offset w30, -16 +; V83A-NEXT: bl foo +; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; V83A-NEXT: adr x16, .Ltmp3 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: non_leaf_sign_non_leaf: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp3: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 +; PAUTHLR-NEXT: .cfi_offset w30, -16 +; PAUTHLR-NEXT: bl foo +; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; PAUTHLR-NEXT: retaasppc .Ltmp3 + %call = call i32 @foo(i32 %x) + ret i32 %call +} + +; Should not use the RETAA instruction. 
+define i32 @non_leaf_scs(i32 %x) "sign-return-address"="non-leaf" shadowcallstack "target-features"="+v8.3a,+reserve-x18" { +; CHECK-LABEL: non_leaf_scs: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [x18], #8 +; CHECK-NEXT: .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 // +; CHECK-NEXT: hint #39 +; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: paciasp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl foo +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: adr x16, .Ltmp4 +; CHECK-NEXT: hint #39 +; CHECK-NEXT: autiasp +; CHECK-NEXT: ldr x30, [x18, #-8]! +; CHECK-NEXT: ret +; +; PAUTHLR-LABEL: non_leaf_scs: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: str x30, [x18], #8 +; PAUTHLR-NEXT: .cfi_escape 0x16, 0x12, 0x02, 0x82, 0x78 // +; PAUTHLR-NEXT: .Ltmp4: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 +; PAUTHLR-NEXT: .cfi_offset w30, -16 +; PAUTHLR-NEXT: bl foo +; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; PAUTHLR-NEXT: autiasppc .Ltmp4 +; PAUTHLR-NEXT: ldr x30, [x18, #-8]! +; PAUTHLR-NEXT: ret + %call = call i32 @foo(i32 %x) + ret i32 %call +} + +define i32 @leaf_sign_all_v83(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" { +; CHECK-LABEL: leaf_sign_all_v83: +; CHECK: // %bb.0: +; CHECK-NEXT: hint #39 +; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: paciasp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK-NEXT: adr x16, .Ltmp5 +; CHECK-NEXT: hint #39 +; CHECK-NEXT: retaa +; +; PAUTHLR-LABEL: leaf_sign_all_v83: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp5: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retaasppc .Ltmp5 + ret i32 %x +} + +declare fastcc i64 @bar(i64) + +define fastcc void @spill_lr_and_tail_call(i64 %x) "sign-return-address"="all" { +; COMPAT-LABEL: spill_lr_and_tail_call: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp6: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; COMPAT-NEXT: .cfi_def_cfa_offset 16 +; COMPAT-NEXT: .cfi_offset w30, -16 +; COMPAT-NEXT: //APP +; COMPAT-NEXT: mov x30, x0 +; COMPAT-NEXT: //NO_APP +; COMPAT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; COMPAT-NEXT: adr x16, .Ltmp6 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: b bar +; +; V83A-LABEL: spill_lr_and_tail_call: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp6: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; V83A-NEXT: .cfi_def_cfa_offset 16 +; V83A-NEXT: .cfi_offset w30, -16 +; V83A-NEXT: //APP +; V83A-NEXT: mov x30, x0 +; V83A-NEXT: //NO_APP +; V83A-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; V83A-NEXT: adr x16, .Ltmp6 +; V83A-NEXT: hint #39 +; V83A-NEXT: autiasp +; V83A-NEXT: b bar +; +; PAUTHLR-LABEL: spill_lr_and_tail_call: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp6: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; PAUTHLR-NEXT: .cfi_def_cfa_offset 16 +; PAUTHLR-NEXT: .cfi_offset w30, -16 +; PAUTHLR-NEXT: //APP +; PAUTHLR-NEXT: mov x30, x0 +; PAUTHLR-NEXT: //NO_APP +; PAUTHLR-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; PAUTHLR-NEXT: autiasppc .Ltmp6 +; PAUTHLR-NEXT: b bar + call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1 + tail call fastcc i64 @bar(i64 %x) + ret void +} + +define i32 @leaf_sign_all_a_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" { +; COMPAT-LABEL: leaf_sign_all_a_key: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp7: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: adr x16, .Ltmp7 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_sign_all_a_key: +; V83A: // %bb.0: +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp7: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: adr x16, .Ltmp7 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: leaf_sign_all_a_key: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .Ltmp7: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retaasppc .Ltmp7 + ret i32 %x +} + +define i32 @leaf_sign_all_b_key(i32 %x) "sign-return-address"="all" "sign-return-address-key"="b_key" { +; COMPAT-LABEL: leaf_sign_all_b_key: +; COMPAT: // %bb.0: +; COMPAT-NEXT: .cfi_b_key_frame +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp8: +; COMPAT-NEXT: hint #27 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: adr x16, .Ltmp8 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #31 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_sign_all_b_key: +; V83A: // %bb.0: +; V83A-NEXT: .cfi_b_key_frame +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp8: +; V83A-NEXT: pacibsp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: adr x16, .Ltmp8 +; V83A-NEXT: hint #39 +; V83A-NEXT: retab +; +; PAUTHLR-LABEL: leaf_sign_all_b_key: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .cfi_b_key_frame +; PAUTHLR-NEXT: .Ltmp8: +; PAUTHLR-NEXT: pacibsppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retabsppc .Ltmp8 + ret i32 %x +} + +define i32 @leaf_sign_all_v83_b_key(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" { +; CHECK-LABEL: leaf_sign_all_v83_b_key: +; CHECK: // %bb.0: +; CHECK-NEXT: .cfi_b_key_frame +; CHECK-NEXT: hint #39 +; CHECK-NEXT: .Ltmp9: +; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK-NEXT: adr x16, .Ltmp9 +; CHECK-NEXT: hint #39 +; CHECK-NEXT: retab +; +; PAUTHLR-LABEL: leaf_sign_all_v83_b_key: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: .cfi_b_key_frame +; PAUTHLR-NEXT: .Ltmp9: +; PAUTHLR-NEXT: pacibsppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retabsppc .Ltmp9 + ret i32 %x +} + +; Note that BTI instruction is not needed before PACIASP. 
+define i32 @leaf_sign_all_a_key_bti(i32 %x) "sign-return-address"="all" "sign-return-address-key"="a_key" "branch-target-enforcement"="true"{ +; COMPAT-LABEL: leaf_sign_all_a_key_bti: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #34 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp10: +; COMPAT-NEXT: hint #25 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: adr x16, .Ltmp10 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #29 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_sign_all_a_key_bti: +; V83A: // %bb.0: +; V83A-NEXT: hint #34 +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp10: +; V83A-NEXT: paciasp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: adr x16, .Ltmp10 +; V83A-NEXT: hint #39 +; V83A-NEXT: retaa +; +; PAUTHLR-LABEL: leaf_sign_all_a_key_bti: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: bti c +; PAUTHLR-NEXT: .Ltmp10: +; PAUTHLR-NEXT: paciasppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retaasppc .Ltmp10 + ret i32 %x +} + +; Note that BTI instruction is not needed before PACIBSP. +define i32 @leaf_sign_all_b_key_bti(i32 %x) "sign-return-address"="all" "sign-return-address-key"="b_key" "branch-target-enforcement"="true"{ +; COMPAT-LABEL: leaf_sign_all_b_key_bti: +; COMPAT: // %bb.0: +; COMPAT-NEXT: hint #34 +; COMPAT-NEXT: .cfi_b_key_frame +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: .Ltmp11: +; COMPAT-NEXT: hint #27 +; COMPAT-NEXT: .cfi_negate_ra_state +; COMPAT-NEXT: adr x16, .Ltmp11 +; COMPAT-NEXT: hint #39 +; COMPAT-NEXT: hint #31 +; COMPAT-NEXT: ret +; +; V83A-LABEL: leaf_sign_all_b_key_bti: +; V83A: // %bb.0: +; V83A-NEXT: hint #34 +; V83A-NEXT: .cfi_b_key_frame +; V83A-NEXT: hint #39 +; V83A-NEXT: .Ltmp11: +; V83A-NEXT: pacibsp +; V83A-NEXT: .cfi_negate_ra_state +; V83A-NEXT: adr x16, .Ltmp11 +; V83A-NEXT: hint #39 +; V83A-NEXT: retab +; +; PAUTHLR-LABEL: leaf_sign_all_b_key_bti: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: bti c +; PAUTHLR-NEXT: .cfi_b_key_frame +; PAUTHLR-NEXT: .Ltmp11: +; PAUTHLR-NEXT: pacibsppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retabsppc .Ltmp11 + ret i32 %x +} + +; Note that BTI instruction is not needed before PACIBSP. +define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" "branch-target-enforcement"="true" { +; CHECK-LABEL: leaf_sign_all_v83_b_key_bti: +; CHECK: // %bb.0: +; CHECK-NEXT: hint #34 +; CHECK-NEXT: .cfi_b_key_frame +; CHECK-NEXT: hint #39 +; CHECK-NEXT: .Ltmp12: +; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK-NEXT: adr x16, .Ltmp12 +; CHECK-NEXT: hint #39 +; CHECK-NEXT: retab +; +; PAUTHLR-LABEL: leaf_sign_all_v83_b_key_bti: +; PAUTHLR: // %bb.0: +; PAUTHLR-NEXT: bti c +; PAUTHLR-NEXT: .cfi_b_key_frame +; PAUTHLR-NEXT: .Ltmp12: +; PAUTHLR-NEXT: pacibsppc +; PAUTHLR-NEXT: .cfi_negate_ra_state +; PAUTHLR-NEXT: retabsppc .Ltmp12 + ret i32 %x +} diff --git a/llvm/test/CodeGen/AArch64/sign-return-address.ll b/llvm/test/CodeGen/AArch64/sign-return-address.ll index 5680915c7f414..1481d4beb50d6 100644 --- a/llvm/test/CodeGen/AArch64/sign-return-address.ll +++ b/llvm/test/CodeGen/AArch64/sign-return-address.ll @@ -2,6 +2,9 @@ ; RUN: llc -mtriple=aarch64 < %s | FileCheck --check-prefixes=CHECK,COMPAT %s ; RUN: llc -mtriple=aarch64 -mattr=v8.3a < %s | FileCheck --check-prefixes=CHECK,V83A %s +; v9.5-A is not expected to change codegen without -mbranch-protection=+pc, so reuse V83A. 
+; RUN: llc -mtriple=aarch64 -mattr=v9.5a < %s | FileCheck --check-prefixes=CHECK,V83A %s + define i32 @leaf(i32 %x) { ; CHECK-LABEL: leaf: ; CHECK: // %bb.0: diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 30e60ad92b68e..866176ab09836 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1812,7 +1812,8 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { AArch64::AEK_SSVE_FP8DOT4, AArch64::AEK_LUT, AArch64::AEK_SME_LUTv2, AArch64::AEK_SMEF8F16, AArch64::AEK_SMEF8F32, AArch64::AEK_SMEFA64, - AArch64::AEK_CPA}; + AArch64::AEK_CPA, AArch64::AEK_PAUTHLR, + }; std::vector<StringRef> Features; @@ -1899,6 +1900,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+sme-f8f32")); EXPECT_TRUE(llvm::is_contained(Features, "+sme-fa64")); EXPECT_TRUE(llvm::is_contained(Features, "+cpa")); + EXPECT_TRUE(llvm::is_contained(Features, "+pauth-lr")); // Assuming we listed every extension above, this should produce the same // result. (note that AEK_NONE doesn't have a name so it won't be in the From 2366d53d8d8726b73408597b534d2f910c3d3e6d Mon Sep 17 00:00:00 2001 From: Arthur Eubanks <aeubanks@google.com> Date: Thu, 21 Dec 2023 10:40:56 -0800 Subject: [PATCH 106/342] [X86] Fix more medium code model addressing modes (#75641) By looking at whether a global is large instead of looking at the code model. This also fixes references to large data in the small code model. We now always fold any 32-bit offset into the addressing mode with the large code model since it uses 64-bit relocations. --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 31 ++++----- llvm/lib/Target/X86/X86ISelLowering.cpp | 29 ++++---- llvm/test/CodeGen/X86/code-model-elf.ll | 66 ++++++++++--------- .../CodeGen/X86/fast-isel-large-object.ll | 5 +- llvm/test/CodeGen/X86/fold-add.ll | 3 +- 5 files changed, 68 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 7ec59c74f5f58..77a997588c4fe 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1828,9 +1828,7 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { // That signifies access to globals that are known to be "near", // such as the GOT itself. CodeModel::Model M = TM.getCodeModel(); - if (Subtarget->is64Bit() && - ((M == CodeModel::Large && !IsRIPRelTLS) || - (M == CodeModel::Medium && !IsRIPRel))) + if (Subtarget->is64Bit() && M == CodeModel::Large && !IsRIPRelTLS) return true; // Base and index reg must be 0 in order to use %rip as base. @@ -1866,6 +1864,13 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { } else llvm_unreachable("Unhandled symbol reference node."); + // Can't use an addressing mode with large globals. + if (Subtarget->is64Bit() && !IsRIPRel && AM.GV && + TM.isLargeGlobalValue(AM.GV)) { + AM = Backup; + return true; + } + if (foldOffsetIntoAddress(Offset, AM)) { AM = Backup; return true; @@ -1910,20 +1915,12 @@ bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, // because it has a smaller encoding. - // TODO: Which other code models can use this? 
- switch (TM.getCodeModel()) { - default: break; - case CodeModel::Small: - case CodeModel::Kernel: - if (Subtarget->is64Bit() && - AM.Scale == 1 && - AM.BaseType == X86ISelAddressMode::RegBase && - AM.Base_Reg.getNode() == nullptr && - AM.IndexReg.getNode() == nullptr && - AM.SymbolFlags == X86II::MO_NO_FLAG && - AM.hasSymbolicDisplacement()) - AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); - break; + if (TM.getCodeModel() != CodeModel::Large && + (!AM.GV || !TM.isLargeGlobalValue(AM.GV)) && Subtarget->is64Bit() && + AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr && + AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) { + AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); } return false; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index db5e4fe84f410..49112862a3142 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2674,34 +2674,33 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); } -bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, - bool hasSymbolicDisplacement) { +bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model CM, + bool HasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra // restrictions. - if (!hasSymbolicDisplacement) + if (!HasSymbolicDisplacement) return true; - // FIXME: Some tweaks might be needed for medium code model. - if (M != CodeModel::Small && M != CodeModel::Kernel) - return false; - - // For small code model we assume that latest object is 16MB before end of 31 - // bits boundary. We may also accept pretty large negative constants knowing - // that all objects are in the positive half of address space. - if (M == CodeModel::Small && Offset < 16*1024*1024) + // We can fold large offsets in the large code model because we always use + // 64-bit offsets. + if (CM == CodeModel::Large) return true; // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. - if (M == CodeModel::Kernel && Offset >= 0) - return true; - - return false; + if (CM == CodeModel::Kernel) + return Offset >= 0; + + // For other non-large code models we assume that latest small object is 16MB + // before end of 31 bits boundary. We may also accept pretty large negative + // constants knowing that all objects are in the positive half of address + // space. + return Offset < 16 * 1024 * 1024; } /// Return true if the condition is an signed comparison operation. 
diff --git a/llvm/test/CodeGen/X86/code-model-elf.ll b/llvm/test/CodeGen/X86/code-model-elf.ll index 6112f2a57b82c..afcffb3a7aded 100644 --- a/llvm/test/CodeGen/X86/code-model-elf.ll +++ b/llvm/test/CodeGen/X86/code-model-elf.ll @@ -11,6 +11,16 @@ ; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=large | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-PIC ; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=large -large-data-threshold=1000 | FileCheck %s --check-prefix=CHECK --check-prefix=LARGE-SMALL-DATA-PIC +; Check that the relocations we emit are valid. +; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=small -filetype=obj -o /dev/null +; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=medium -filetype=obj -o /dev/null +; RUN: llc -verify-machineinstrs < %s -relocation-model=static -code-model=large -filetype=obj -o /dev/null +; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=small -filetype=obj -o /dev/null +; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=medium -large-data-threshold=1000 -filetype=obj -o /dev/null +; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=medium -filetype=obj -o /dev/null +; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=large -filetype=obj -o /dev/null +; RUN: llc -verify-machineinstrs < %s -relocation-model=pic -code-model=large -large-data-threshold=1000 -filetype=obj -o /dev/null + ; Generated from this C source: ; ; static int static_data[10]; @@ -376,7 +386,6 @@ define dso_local ptr @lea_forced_small_data() #0 { ret ptr @forced_small_data } -; TODO: make small and medium instruction sequence the same define dso_local i32 @load_forced_small_data() #0 { ; SMALL-STATIC-LABEL: load_forced_small_data: ; SMALL-STATIC: # %bb.0: @@ -385,14 +394,13 @@ define dso_local i32 @load_forced_small_data() #0 { ; ; MEDIUM-STATIC-LABEL: load_forced_small_data: ; MEDIUM-STATIC: # %bb.0: -; MEDIUM-STATIC-NEXT: movl $forced_small_data, %eax -; MEDIUM-STATIC-NEXT: movl 8(%rax), %eax +; MEDIUM-STATIC-NEXT: movl forced_small_data+8(%rip), %eax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: load_forced_small_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movl $forced_small_data, %eax -; LARGE-STATIC-NEXT: movl 8(%rax), %eax +; LARGE-STATIC-NEXT: movl $forced_small_data+8, %eax +; LARGE-STATIC-NEXT: movl (%rax), %eax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: load_forced_small_data: @@ -402,14 +410,12 @@ define dso_local i32 @load_forced_small_data() #0 { ; ; MEDIUM-SMALL-DATA-PIC-LABEL: load_forced_small_data: ; MEDIUM-SMALL-DATA-PIC: # %bb.0: -; MEDIUM-SMALL-DATA-PIC-NEXT: leaq forced_small_data(%rip), %rax -; MEDIUM-SMALL-DATA-PIC-NEXT: movl 8(%rax), %eax +; MEDIUM-SMALL-DATA-PIC-NEXT: movl forced_small_data+8(%rip), %eax ; MEDIUM-SMALL-DATA-PIC-NEXT: retq ; ; MEDIUM-PIC-LABEL: load_forced_small_data: ; MEDIUM-PIC: # %bb.0: -; MEDIUM-PIC-NEXT: leaq forced_small_data(%rip), %rax -; MEDIUM-PIC-NEXT: movl 8(%rax), %eax +; MEDIUM-PIC-NEXT: movl forced_small_data+8(%rip), %eax ; MEDIUM-PIC-NEXT: retq ; ; LARGE-PIC-LABEL: load_forced_small_data: @@ -435,7 +441,6 @@ define dso_local i32 @load_forced_small_data() #0 { ret i32 %rv } -; TODO: fix small code model instruction sequences to use 64-bit constants define dso_local ptr @lea_forced_large_data() #0 { ; SMALL-STATIC-LABEL: lea_forced_large_data: ; SMALL-STATIC: # %bb.0: @@ -454,8 +459,9 @@ define dso_local ptr 
@lea_forced_large_data() #0 { ; ; SMALL-PIC-LABEL: lea_forced_large_data: ; SMALL-PIC: # %bb.0: -; SMALL-PIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax -; SMALL-PIC-NEXT: leaq forced_large_data@GOTOFF(%rax), %rax +; SMALL-PIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rcx +; SMALL-PIC-NEXT: movabsq $forced_large_data@GOTOFF, %rax +; SMALL-PIC-NEXT: addq %rcx, %rax ; SMALL-PIC-NEXT: retq ; ; MEDIUM-SMALL-DATA-PIC-LABEL: lea_forced_large_data: @@ -497,25 +503,27 @@ define dso_local ptr @lea_forced_large_data() #0 { define dso_local i32 @load_forced_large_data() #0 { ; SMALL-STATIC-LABEL: load_forced_large_data: ; SMALL-STATIC: # %bb.0: -; SMALL-STATIC-NEXT: movl forced_large_data+8(%rip), %eax +; SMALL-STATIC-NEXT: movabsq $forced_large_data+8, %rax +; SMALL-STATIC-NEXT: movl (%rax), %eax ; SMALL-STATIC-NEXT: retq ; ; MEDIUM-STATIC-LABEL: load_forced_large_data: ; MEDIUM-STATIC: # %bb.0: -; MEDIUM-STATIC-NEXT: movabsq $forced_large_data, %rax -; MEDIUM-STATIC-NEXT: movl 8(%rax), %eax +; MEDIUM-STATIC-NEXT: movabsq $forced_large_data+8, %rax +; MEDIUM-STATIC-NEXT: movl (%rax), %eax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: load_forced_large_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movabsq $forced_large_data, %rax -; LARGE-STATIC-NEXT: movl 8(%rax), %eax +; LARGE-STATIC-NEXT: movabsq $forced_large_data+8, %rax +; LARGE-STATIC-NEXT: movl (%rax), %eax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: load_forced_large_data: ; SMALL-PIC: # %bb.0: ; SMALL-PIC-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax -; SMALL-PIC-NEXT: movl forced_large_data@GOTOFF+8(%rax), %eax +; SMALL-PIC-NEXT: movabsq $forced_large_data@GOTOFF, %rcx +; SMALL-PIC-NEXT: movl 8(%rax,%rcx), %eax ; SMALL-PIC-NEXT: retq ; ; MEDIUM-SMALL-DATA-PIC-LABEL: load_forced_large_data: @@ -563,14 +571,14 @@ define dso_local i32 @load_global_data() #0 { ; ; MEDIUM-STATIC-LABEL: load_global_data: ; MEDIUM-STATIC: # %bb.0: -; MEDIUM-STATIC-NEXT: movabsq $global_data, %rax -; MEDIUM-STATIC-NEXT: movl 8(%rax), %eax +; MEDIUM-STATIC-NEXT: movabsq $global_data+8, %rax +; MEDIUM-STATIC-NEXT: movl (%rax), %eax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: load_global_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movabsq $global_data, %rax -; LARGE-STATIC-NEXT: movl 8(%rax), %eax +; LARGE-STATIC-NEXT: movabsq $global_data+8, %rax +; LARGE-STATIC-NEXT: movl (%rax), %eax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: load_global_data: @@ -580,8 +588,7 @@ define dso_local i32 @load_global_data() #0 { ; ; MEDIUM-SMALL-DATA-PIC-LABEL: load_global_data: ; MEDIUM-SMALL-DATA-PIC: # %bb.0: -; MEDIUM-SMALL-DATA-PIC-NEXT: leaq global_data(%rip), %rax -; MEDIUM-SMALL-DATA-PIC-NEXT: movl 8(%rax), %eax +; MEDIUM-SMALL-DATA-PIC-NEXT: movl global_data+8(%rip), %eax ; MEDIUM-SMALL-DATA-PIC-NEXT: retq ; ; MEDIUM-PIC-LABEL: load_global_data: @@ -684,14 +691,14 @@ define dso_local i32 @load_unknown_size_data() #0 { ; ; MEDIUM-STATIC-LABEL: load_unknown_size_data: ; MEDIUM-STATIC: # %bb.0: -; MEDIUM-STATIC-NEXT: movabsq $unknown_size_data, %rax -; MEDIUM-STATIC-NEXT: movl 8(%rax), %eax +; MEDIUM-STATIC-NEXT: movabsq $unknown_size_data+8, %rax +; MEDIUM-STATIC-NEXT: movl (%rax), %eax ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: load_unknown_size_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movabsq $unknown_size_data, %rax -; LARGE-STATIC-NEXT: movl 8(%rax), %eax +; LARGE-STATIC-NEXT: movabsq $unknown_size_data+8, %rax +; LARGE-STATIC-NEXT: movl (%rax), %eax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: load_unknown_size_data: 
@@ -1127,8 +1134,7 @@ define dso_local float @load_constant_pool(float %x) #0 { ; ; MEDIUM-STATIC-LABEL: load_constant_pool: ; MEDIUM-STATIC: # %bb.0: -; MEDIUM-STATIC-NEXT: movl ${{\.?LCPI[0-9]+_[0-9]+}}, %eax -; MEDIUM-STATIC-NEXT: addss (%rax), %xmm0 +; MEDIUM-STATIC-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; MEDIUM-STATIC-NEXT: retq ; ; LARGE-STATIC-LABEL: load_constant_pool: diff --git a/llvm/test/CodeGen/X86/fast-isel-large-object.ll b/llvm/test/CodeGen/X86/fast-isel-large-object.ll index 6ca2c42407237..9acdfdeaf7cc9 100644 --- a/llvm/test/CodeGen/X86/fast-isel-large-object.ll +++ b/llvm/test/CodeGen/X86/fast-isel-large-object.ll @@ -6,8 +6,9 @@ define ptr @f() { ; CHECK-LABEL: f: ; CHECK: # %bb.0: -; CHECK-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rax -; CHECK-NEXT: leaq g@GOTOFF(%rax), %rax +; CHECK-NEXT: leaq _GLOBAL_OFFSET_TABLE_(%rip), %rcx +; CHECK-NEXT: movabsq $g@GOTOFF, %rax +; CHECK-NEXT: addq %rcx, %rax ; CHECK-NEXT: retq ret ptr @g } diff --git a/llvm/test/CodeGen/X86/fold-add.ll b/llvm/test/CodeGen/X86/fold-add.ll index 597e51d877eb4..8c28d66597fb3 100644 --- a/llvm/test/CodeGen/X86/fold-add.ll +++ b/llvm/test/CodeGen/X86/fold-add.ll @@ -45,8 +45,7 @@ define dso_local i64 @one() #0 { ; ; MSTATIC-LABEL: one: ; MSTATIC: # %bb.0: # %entry -; MSTATIC-NEXT: movabsq $foo, %rax -; MSTATIC-NEXT: incq %rax +; MSTATIC-NEXT: movabsq $foo+1, %rax ; MSTATIC-NEXT: retq ; ; MPIC-LABEL: one: From 9664ab570ae44068766cc722e8d5e62003d84361 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Thu, 21 Dec 2023 10:43:04 -0800 Subject: [PATCH 107/342] [llvm-profdata] Modernize FuncSampleStats, ValueSitesStats, and HotFuncInfo (NFC) --- llvm/tools/llvm-profdata/llvm-profdata.cpp | 29 ++++++++++------------ 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 322b7da2678f4..12b81d411cfa9 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -1651,10 +1651,10 @@ struct SampleOverlapStats { namespace { struct FuncSampleStats { - uint64_t SampleSum; - uint64_t MaxSample; - uint64_t HotBlockCount; - FuncSampleStats() : SampleSum(0), MaxSample(0), HotBlockCount(0) {} + uint64_t SampleSum = 0; + uint64_t MaxSample = 0; + uint64_t HotBlockCount = 0; + FuncSampleStats() = default; FuncSampleStats(uint64_t SampleSum, uint64_t MaxSample, uint64_t HotBlockCount) : SampleSum(SampleSum), MaxSample(MaxSample), @@ -2563,12 +2563,10 @@ static int overlap_main(int argc, const char *argv[]) { namespace { struct ValueSitesStats { - ValueSitesStats() - : TotalNumValueSites(0), TotalNumValueSitesWithValueProfile(0), - TotalNumValues(0) {} - uint64_t TotalNumValueSites; - uint64_t TotalNumValueSitesWithValueProfile; - uint64_t TotalNumValues; + ValueSitesStats() = default; + uint64_t TotalNumValueSites = 0; + uint64_t TotalNumValueSitesWithValueProfile = 0; + uint64_t TotalNumValues = 0; std::vector<unsigned> ValueSitesHistogram; }; } // namespace @@ -2867,13 +2865,12 @@ static void showSectionInfo(sampleprof::SampleProfileReader *Reader, namespace { struct HotFuncInfo { std::string FuncName; - uint64_t TotalCount; - double TotalCountPercent; - uint64_t MaxCount; - uint64_t EntryCount; + uint64_t TotalCount = 0; + double TotalCountPercent = 0.0f; + uint64_t MaxCount = 0; + uint64_t EntryCount = 0; - HotFuncInfo() - : TotalCount(0), TotalCountPercent(0.0f), MaxCount(0), EntryCount(0) {} + HotFuncInfo() = default; HotFuncInfo(StringRef FN, 
uint64_t TS, double TSP, uint64_t MS, uint64_t ES) : FuncName(FN.begin(), FN.end()), TotalCount(TS), TotalCountPercent(TSP), From e6f57628790421b16a02b0cb4a67fab4f2f48004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com> Date: Thu, 21 Dec 2023 18:45:00 +0000 Subject: [PATCH 108/342] [mlir][vector][nfc] Add a test case for scalable vectors (#76138) Extends fold-arith-extf-into-vector-contract.mlir by adding a test case for scalable vectors. --- .../fold-arith-extf-into-vector-contract.mlir | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/mlir/test/Dialect/Vector/fold-arith-extf-into-vector-contract.mlir b/mlir/test/Dialect/Vector/fold-arith-extf-into-vector-contract.mlir index 79429afd8ff29..31ae126906f21 100644 --- a/mlir/test/Dialect/Vector/fold-arith-extf-into-vector-contract.mlir +++ b/mlir/test/Dialect/Vector/fold-arith-extf-into-vector-contract.mlir @@ -10,9 +10,41 @@ // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} // CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]] : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32> // CHECK-NEXT: return %[[R]] : vector<64x64xf32> -func.func @fold_arith_extf_into_contract(%arg0: vector<64x64xf16>, %arg1: vector<64x64xf16>, %arg2: vector<64x64xf32>) -> vector<64x64xf32> { +func.func @fold_arith_extf_into_contract( + %arg0: vector<64x64xf16>, + %arg1: vector<64x64xf16>, + %arg2: vector<64x64xf32>) -> vector<64x64xf32> { %lhs_f32 = arith.extf %arg0 : vector<64x64xf16> to vector<64x64xf32> %rhs_f32 = arith.extf %arg1 : vector<64x64xf16> to vector<64x64xf32> - %result = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_f32, %rhs_f32, %arg2 : vector<64x64xf32>, vector<64x64xf32> into vector<64x64xf32> + %result = vector.contract { + indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + iterator_types = ["parallel", "parallel", "reduction"], + kind = #vector.kind<add>} + %lhs_f32, %rhs_f32, %arg2 : vector<64x64xf32>, vector<64x64xf32> into vector<64x64xf32> return %result : vector<64x64xf32> -} \ No newline at end of file +} + +// ----- + +// CHECK-DAG: #[[$map0:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[$map1:.*]] = affine_map<(d0, d1, d2) -> (d2, d1)> +// CHECK-DAG: #[[$map2:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)> +// CHECK-LABEL: func.func @fold_arith_extf_into_contract_scalable +// CHECK-SAME: (%[[ARG0:.*]]: vector<[64]x64xf16>, %[[ARG1:.*]]: vector<64x64xf16>, %[[ARG2:.*]]: vector<[64]x64xf32>) +// CHECK-NEXT: %[[R:.+]] = vector.contract {indexing_maps = [#[[$map0]], #[[$map1]], #[[$map2]]], +// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} +// CHECK-SAME: %[[ARG0]], %[[ARG1]], %[[ARG2]] : vector<[64]x64xf16>, vector<64x64xf16> into vector<[64]x64xf32> +// CHECK-NEXT: return %[[R]] : vector<[64]x64xf32> +func.func @fold_arith_extf_into_contract_scalable( + %arg0: vector<[64]x64xf16>, + %arg1: vector<64x64xf16>, + %arg2: vector<[64]x64xf32>) -> vector<[64]x64xf32> { + %lhs_f32 = arith.extf %arg0 : vector<[64]x64xf16> to vector<[64]x64xf32> + %rhs_f32 = arith.extf %arg1 : vector<64x64xf16> to vector<64x64xf32> + %result = vector.contract { + indexing_maps = [affine_map<(d0, d1, d2) -> 
(d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], + iterator_types = ["parallel", "parallel", "reduction"], + kind = #vector.kind<add>} + %lhs_f32, %rhs_f32, %arg2 : vector<[64]x64xf32>, vector<64x64xf32> into vector<[64]x64xf32> + return %result : vector<[64]x64xf32> +} From 7433b1ca3ebe9f3b20758336535e82531cbae96f Mon Sep 17 00:00:00 2001 From: Arthur Eubanks <aeubanks@google.com> Date: Thu, 21 Dec 2023 10:41:39 -0800 Subject: [PATCH 109/342] Reapply "[X86] Set SHF_X86_64_LARGE for globals with explicit well-known large section name (#74381)" This reverts commit 19fff858931bf575b63a0078cc553f8f93cced20. Now that explicit large globals are handled properly in the small code model. --- llvm/lib/Target/TargetMachine.cpp | 4 ++-- llvm/test/CodeGen/X86/code-model-elf-sections.ll | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index 3f96bd37755e5..2a4383314e465 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -61,8 +61,6 @@ bool TargetMachine::isLargeGlobalValue(const GlobalValue *GVal) const { // We should properly mark well-known section name prefixes as small/large, // because otherwise the output section may have the wrong section flags and // the linker will lay it out in an unexpected way. - // TODO: bring back lbss/ldata/lrodata checks after fixing accesses to large - // globals in the small code model. StringRef Name = GV->getSection(); if (!Name.empty()) { auto IsPrefix = [&](StringRef Prefix) { @@ -71,6 +69,8 @@ bool TargetMachine::isLargeGlobalValue(const GlobalValue *GVal) const { }; if (IsPrefix(".bss") || IsPrefix(".data") || IsPrefix(".rodata")) return false; + if (IsPrefix(".lbss") || IsPrefix(".ldata") || IsPrefix(".lrodata")) + return true; } // For x86-64, we treat an explicit GlobalVariable small code model to mean diff --git a/llvm/test/CodeGen/X86/code-model-elf-sections.ll b/llvm/test/CodeGen/X86/code-model-elf-sections.ll index cb19f0d34f59f..749d5b6bf904e 100644 --- a/llvm/test/CodeGen/X86/code-model-elf-sections.ll +++ b/llvm/test/CodeGen/X86/code-model-elf-sections.ll @@ -21,16 +21,16 @@ ; SMALL: .data {{.*}} WA {{.*}} ; SMALL: .data.x {{.*}} WA {{.*}} ; SMALL: .data0 {{.*}} WA {{.*}} -; SMALL: .ldata {{.*}} WA {{.*}} -; SMALL: .ldata.x {{.*}} WA {{.*}} +; SMALL: .ldata {{.*}} WAl {{.*}} +; SMALL: .ldata.x {{.*}} WAl {{.*}} ; SMALL: .ldata0 {{.*}} WA {{.*}} ; SMALL: force_small {{.*}} WA {{.*}} ; SMALL: force_large {{.*}} WAl {{.*}} ; SMALL: foo {{.*}} WA {{.*}} ; SMALL: .bss {{.*}} WA {{.*}} -; SMALL: .lbss {{.*}} WA {{.*}} +; SMALL: .lbss {{.*}} WAl {{.*}} ; SMALL: .rodata {{.*}} A {{.*}} -; SMALL: .lrodata {{.*}} A {{.*}} +; SMALL: .lrodata {{.*}} Al {{.*}} ; SMALL: .data.rel.ro {{.*}} WA {{.*}} ; SMALL: .tbss {{.*}} WAT {{.*}} ; SMALL: .tdata {{.*}} WAT {{.*}} @@ -38,16 +38,16 @@ ; SMALL-DS: .data {{.*}} WA {{.*}} ; SMALL-DS: .data.x {{.*}} WA {{.*}} ; SMALL-DS: .data0 {{.*}} WA {{.*}} -; SMALL-DS: .ldata {{.*}} WA {{.*}} -; SMALL-DS: .ldata.x {{.*}} WA {{.*}} +; SMALL-DS: .ldata {{.*}} WAl {{.*}} +; SMALL-DS: .ldata.x {{.*}} WAl {{.*}} ; SMALL-DS: .ldata0 {{.*}} WA {{.*}} ; SMALL-DS: .data.data {{.*}} WA {{.*}} ; SMALL-DS: force_small {{.*}} WA {{.*}} ; SMALL-DS: force_large {{.*}} WAl {{.*}} ; SMALL-DS: foo {{.*}} WA {{.*}} -; SMALL-DS: .lbss {{.*}} WA {{.*}} +; SMALL-DS: .lbss {{.*}} WAl {{.*}} ; SMALL-DS: .bss.bss {{.*}} WA {{.*}} -; SMALL-DS: .lrodata {{.*}} A {{.*}} +; 
SMALL-DS: .lrodata {{.*}} Al {{.*}} ; SMALL-DS: .rodata.rodata {{.*}} A {{.*}} ; SMALL-DS: .data.rel.ro.relro {{.*}} WA {{.*}} ; SMALL-DS: .tbss.tbss {{.*}} WAT {{.*}} From 058e527434aeb61ee6f72d2d460123440726a7df Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan <fpiovezan@apple.com> Date: Thu, 21 Dec 2023 16:10:30 -0300 Subject: [PATCH 110/342] [AccelTable][NFC] Fix typos and duplicated code (#76155) Renaming a member variable from "Endoding" to "Encoding". Also replace inlined code for "isNormalized" with a call to the function, so that if the definition of normalization ever changes, we only need to change the one place. --- llvm/include/llvm/CodeGen/AccelTable.h | 8 +++----- llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/CodeGen/AccelTable.h b/llvm/include/llvm/CodeGen/AccelTable.h index af874aa5e91a3..6eb09f32f9f95 100644 --- a/llvm/include/llvm/CodeGen/AccelTable.h +++ b/llvm/include/llvm/CodeGen/AccelTable.h @@ -270,16 +270,14 @@ class DWARF5AccelTableData : public AccelTableData { #endif uint64_t getDieOffset() const { - assert(std::holds_alternative<uint64_t>(OffsetVal) && - "Accessing DIE Offset before normalizing."); + assert(isNormalized() && "Accessing DIE Offset before normalizing."); return std::get<uint64_t>(OffsetVal); } unsigned getDieTag() const { return DieTag; } unsigned getUnitID() const { return UnitID; } bool isTU() const { return IsTU; } void normalizeDIEToOffset() { - assert(std::holds_alternative<const DIE *>(OffsetVal) && - "Accessing offset after normalizing."); + assert(!isNormalized() && "Accessing offset after normalizing."); OffsetVal = std::get<const DIE *>(OffsetVal)->getOffset(); } bool isNormalized() const { @@ -309,7 +307,7 @@ class DWARF5AccelTable : public AccelTable<DWARF5AccelTableData> { public: struct UnitIndexAndEncoding { unsigned Index; - DWARF5AccelTableData::AttributeEncoding Endoding; + DWARF5AccelTableData::AttributeEncoding Encoding; }; /// Returns type units that were constructed. const TUVectorTy &getTypeUnitsSymbols() { return TUSymbolsOrHashes; } diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp index d6f487c18b030..30ea7eef3a12b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp @@ -412,7 +412,7 @@ static uint32_t constructAbbreviationTag( const std::optional<DWARF5AccelTable::UnitIndexAndEncoding> &EntryRet) { uint32_t AbbrvTag = 0; if (EntryRet) - AbbrvTag |= 1 << EntryRet->Endoding.Index; + AbbrvTag |= 1 << EntryRet->Encoding.Index; AbbrvTag |= 1 << dwarf::DW_IDX_die_offset; AbbrvTag |= Tag << LowerBitSize; return AbbrvTag; @@ -429,7 +429,7 @@ void Dwarf5AccelTableWriter<DataT>::populateAbbrevsMap() { if (Abbreviations.count(AbbrvTag) == 0) { SmallVector<DWARF5AccelTableData::AttributeEncoding, 2> UA; if (EntryRet) - UA.push_back(EntryRet->Endoding); + UA.push_back(EntryRet->Encoding); UA.push_back({dwarf::DW_IDX_die_offset, dwarf::DW_FORM_ref4}); Abbreviations.try_emplace(AbbrvTag, UA); } From 809f2f3d7dfaff7c239dd99742175287f76560c7 Mon Sep 17 00:00:00 2001 From: Dinar Temirbulatov <Dinar.Temirbulatov@arm.com> Date: Thu, 21 Dec 2023 19:41:24 +0000 Subject: [PATCH 111/342] [AArch64][SME2] Add builtins for FDOT, BFDOT, SUDOT, USDOT, SDOT, UDOT. (#75737) Add SME2 DOT builtins. 
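For example, the new multi-vector dot-product builtins can be called from
streaming code with ZA shared. This is a minimal sketch based on the ACLE
tests added below; the wrapper function name is illustrative, while the
builtin name and the __arm_streaming/__arm_shared_za attributes are taken
from the tests in this patch:

  #include <arm_sme_draft_spec_subject_to_change.h>

  // Accumulate an f16 widening dot-product into 32-bit ZA slices using the
  // multi-vector (vg1x2) form of the new builtin.
  void acc_dot(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm)
      __arm_streaming __arm_shared_za {
    svdot_za32_f16_vg1x2(slice_base, zn, zm);
  }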
--- clang/include/clang/Basic/arm_sme.td | 66 + clang/lib/CodeGen/CGBuiltin.cpp | 2 + .../acle_sme2_fp_dots.c | 297 +++++ .../acle_sme2_int_dots.c | 1103 +++++++++++++++++ .../aarch64-sme2-intrinsics/acle_sme2_vdot.c | 222 ++++ .../aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 65 + 6 files changed, 1755 insertions(+) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 2a9bc6870bf71..aac3bd486de92 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -351,6 +351,72 @@ let TargetGuard = "sme2" in { def SVBMOPA : Inst<"svbmopa_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmopa_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + + // VERTICAL DOT-PRODUCT + def SVVDOT_LANE_ZA32_VG1x2_S : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x4_S : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vm4di", "c", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x2_U : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x4_U : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vm4di", "Uc", MergeNone, "aarch64_sme_uvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x2_F : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "hb", MergeNone, "aarch64_sme_fvdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVSUVDOT_LANE_ZA32_VG1x4 : Inst<"svsuvdot_lane_za32[_{d}]_vg1x4", "vm4di", "c", MergeNone, "aarch64_sme_suvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVUSVDOT_LANE_ZA32_VG1x4 : Inst<"svusvdot_lane_za32[_{d}]_vg1x4", "vm4di", "Uc", MergeNone, "aarch64_sme_usvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + + // Multi-vector signed & unsigned integer dot-product + def SVDOT_MULTI_ZA32_VG1x2_S : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "cs", MergeNone, "aarch64_sme_sdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVDOT_MULTI_ZA32_VG1x4_S : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "cs", MergeNone, "aarch64_sme_sdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVDOT_MULTI_ZA32_VG1x2_U : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "UcUs", MergeNone, "aarch64_sme_udot_za32_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVDOT_MULTI_ZA32_VG1x4_U : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "UcUs", MergeNone, "aarch64_sme_udot_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVDOT_SINGLE_ZA32_VG1x2_S : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "cs", MergeNone, "aarch64_sme_sdot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVDOT_SINGLE_ZA32_VG1x4_S : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "cs", MergeNone, "aarch64_sme_sdot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVDOT_SINGLE_ZA32_VG1x2_U : 
Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "UcUs", MergeNone, "aarch64_sme_udot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVDOT_SINGLE_ZA32_VG1x4_U : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "UcUs", MergeNone, "aarch64_sme_udot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVDOT_LANE_ZA32_VG1x2_S : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "cs", MergeNone, "aarch64_sme_sdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_S : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "cs", MergeNone, "aarch64_sme_sdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x2_U : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "UcUs", MergeNone, "aarch64_sme_udot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_U : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "UcUs", MergeNone, "aarch64_sme_udot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + + def SVUSDOT_SINGLE_ZA32_VG1x2 : Inst<"svusdot[_single]_za32[_{d}]_vg1x2", "vm2.dx", "Uc", MergeNone, "aarch64_sme_usdot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVUSDOT_SINGLE_ZA32_VG1x4 : Inst<"svusdot[_single]_za32[_{d}]_vg1x4", "vm4.dx", "Uc", MergeNone, "aarch64_sme_usdot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVUSDOT_MULTI_ZA32_VG1x2 : Inst<"svusdot_za32[_{d}]_vg1x2", "vm2.d2.x", "Uc", MergeNone, "aarch64_sme_usdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVUSDOT_MULTI_ZA32_VG1x4 : Inst<"svusdot_za32[_{d}]_vg1x4", "vm4.d4.x", "Uc", MergeNone, "aarch64_sme_usdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVUSDOT_LANE_ZA32_VG1x2 : Inst<"svusdot_lane_za32[_{d}]_vg1x2", "vm2.dxi", "Uc", MergeNone, "aarch64_sme_usdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVUSDOT_LANE_ZA32_VG1x4 : Inst<"svusdot_lane_za32[_{d}]_vg1x4", "vm4.dxi", "Uc", MergeNone, "aarch64_sme_usdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + + def SVSUDOT_SINGLE_ZA32_VG1x2 : Inst<"svsudot[_single]_za32[_{d}]_vg1x2", "vm2.du", "c", MergeNone, "aarch64_sme_sudot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVSUDOT_SINGLE_ZA32_VG1x4 : Inst<"svsudot[_single]_za32[_{d}]_vg1x4", "vm4.du", "c", MergeNone, "aarch64_sme_sudot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + + // Multi-multi sudot builtins are mapped to usdot, with zn & zm operands swapped + def SVSUDOT_MULTI_ZA32_VG1x2 : Inst<"svsudot_za32[_{d}]_vg1x2", "vm2.d2.u", "c", MergeNone, "aarch64_sme_usdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVSUDOT_MULTI_ZA32_VG1x4 : Inst<"svsudot_za32[_{d}]_vg1x4", "vm4.d4.u", "c", MergeNone, "aarch64_sme_usdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + + def SVSUDOT_LANE_ZA32_VG1x2 : Inst<"svsudot_lane_za32[_{d}]_vg1x2", "vm2.dui", "c", MergeNone, "aarch64_sme_sudot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVSUDOT_LANE_ZA32_VG1x4 : Inst<"svsudot_lane_za32[_{d}]_vg1x4", "vm4.dui", "c", MergeNone, "aarch64_sme_sudot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + + // Multi-vector half-precision/BFloat16 floating-point dot-product + def SVDOT_MULTI_ZA32_VG1x2_F16 : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "bh", MergeNone, "aarch64_sme_fdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVDOT_MULTI_ZA32_VG1x4_F16 : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "bh", MergeNone, 
"aarch64_sme_fdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVDOT_SINGLE_ZA32_VG1x2_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "bh", MergeNone, "aarch64_sme_fdot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVDOT_SINGLE_ZA32_VG1x4_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "bh", MergeNone, "aarch64_sme_fdot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVDOT_LANE_ZA32_VG1x2_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "bh", MergeNone, "aarch64_sme_fdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "bh", MergeNone, "aarch64_sme_fdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; +} + +let TargetGuard = "sme2,sme-i16i64" in { + def SVVDOT_LANE_ZA64_VG1x4_S : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_svdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVVDOT_LANE_ZA64_VG1x4_U : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; + + def SVDOT_MULTI_ZA64_VG1x2_S16 : Inst<"svdot_za64[_{d}]_vg1x2", "vm22", "s", MergeNone, "aarch64_sme_sdot_za64_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVDOT_MULTI_ZA64_VG1x4_S16 : Inst<"svdot_za64[_{d}]_vg1x4", "vm44", "s", MergeNone, "aarch64_sme_sdot_za64_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVDOT_MULTI_ZA64_VG1x2_U16 : Inst<"svdot_za64[_{d}]_vg1x2", "vm22", "Us", MergeNone, "aarch64_sme_udot_za64_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVDOT_MULTI_ZA64_VG1x4_U16 : Inst<"svdot_za64[_{d}]_vg1x4", "vm44", "Us", MergeNone, "aarch64_sme_udot_za64_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVDOT_SINGLE_ZA64_VG1x2_S16 : Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vm2d", "s", MergeNone, "aarch64_sme_sdot_single_za64_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVDOT_SINGLE_ZA64_VG1x4_S16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vm4d", "s", MergeNone, "aarch64_sme_sdot_single_za64_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVDOT_SINGLE_ZA64_VG1x2_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vm2d", "Us", MergeNone, "aarch64_sme_udot_single_za64_vg1x2", [IsStreaming, IsSharedZA], []>; + def SVDOT_SINGLE_ZA64_VG1x4_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vm4d", "Us", MergeNone, "aarch64_sme_udot_single_za64_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVDOT_LANE_ZA64_VG1x2_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_sdot_lane_za64_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x4_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_sdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x2_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vm2di", "Us", MergeNone, "aarch64_sme_udot_lane_za64_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x4_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_udot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; } // FMLA/FMLS diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 7bc3b7594c8f3..5081062da2862 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10328,8 +10328,10 @@ static void swapCommutativeSMEOperands(unsigned BuiltinID, 
MultiVec = 1; break; case SME::BI__builtin_sme_svsumla_za32_s8_vg4x2: + case SME::BI__builtin_sme_svsudot_za32_s8_vg1x2: MultiVec = 2; break; + case SME::BI__builtin_sme_svsudot_za32_s8_vg1x4: case SME::BI__builtin_sme_svsumla_za32_s8_vg4x4: MultiVec = 4; break; diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c new file mode 100644 index 0000000000000..ff4176530710a --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c @@ -0,0 +1,297 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include <arm_sme_draft_spec_subject_to_change.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi (half) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_f16j13svfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za32,,,_f16,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x 
half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_f16j13svfloat16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP4]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x half> [[TMP6]], <vscale x 8 x half> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za32,,,_f16,_vg1x4)(slice_base, zn, zm); +} + + +// +// Multi, single (half) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_f16,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> 
@llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_f16,,_vg1x4)(slice_base, zn, zm); +} + + +// +// Multi, indexed (half) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_f16j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_f16(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za32,,,_f16,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> 
@llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x4_f16j13svfloat16x4_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv32f16(<vscale x 32 x half> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[TMP2]], <vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_f16(uint32_t slice_base, svfloat16x4_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za32,,,_f16,_vg1x4)(slice_base, zn, zm, 3); +} + + +// +// Multi, multi (bfloat) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_multi_za32_vg1x2_bf16j14svbfloat16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 
16 x bfloat> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za32,,,_bf16,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_multi_za32_vg1x4_bf16j14svbfloat16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x 
bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP4]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x bfloat> [[TMP6]], <vscale x 8 x bfloat> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za32,,,_bf16,_vg1x4)(slice_base, zn, zm); +} + + +// +// Multi, single (bfloat) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svdot_single_za32_vg1x2_bf16j14svbfloat16x2_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_bf16,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svdot_single_za32_vg1x4_bf16j14svbfloat16x4_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) 
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.single.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_bf16,,_vg1x4)(slice_base, zn, zm); +} + + +// +// Multi, indexed (bfloat) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_lane_za32_vg1x2_bf16j14svbfloat16x2_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_bf16(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za32,,_bf16,,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_lane_za32_vg1x4_bf16j14svbfloat16x4_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail 
call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> [[ZN]], i64 24)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fdot.lane.za32.vg1x4.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[TMP2]], <vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za32_vg1x4_bf16(uint32_t slice_base, svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za {
+  SVE_ACLE_FUNC(svdot_lane_za32,,_bf16,,_vg1x4)(slice_base, zn, zm, 3);
+}
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
new file mode 100644
index 0000000000000..0d85071b7fc3e
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
@@ -0,0 +1,1103 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme_draft_spec_subject_to_change.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5 +#else +#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5 +#endif + +// +// Multi, multi (unsigned) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_u16j12svuint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za32,,,_u16,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], 
<vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_u16j12svuint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za32,,,_u16,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x2_u8j11svuint8x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale 
x 16 x i8> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za32,,,_u8,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x4_u8j11svuint8x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za32,,,_u8,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za64_vg1x2_u16( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x2_u16j12svuint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za64,,,_u16,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za64_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x4_u16j12svuint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = 
tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za64,,,_u16,_vg1x4)(slice_base, zn, zm); +} + + +// +// Multi, multi (signed) +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x2_s16j11svint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za { 
+ SVE_ACLE_FUNC(svdot_za32,,,_s16,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za32_vg1x4_s16j11svint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za32,,,_s16,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) 
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x2_s8j10svint8x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za32,,,_s8,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_multi_za32_vg1x4_s8j10svint8x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za32,,,_s8,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za64_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x2_s16j11svint16x2_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za64,,,_s16,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_multi_za64_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_multi_za64_vg1x4_s16j11svint16x4_tS_( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 8) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZM]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_multi_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_za64,,,_s16,_vg1x4)(slice_base, zn, zm); +} + + +// +// Multi, single (unsigned) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void 
@llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_u16,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_u16,,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 
16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_u8,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_u8,,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za64_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z32test_svdot_single_za64_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za64,,_u16,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za64_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za64,,_u16,,_vg1x4)(slice_base, zn, zm); +} + + +// +// Multi, single (signed) +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// 
CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_s16,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za32_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_s16,,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> 
@llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_s8,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svdot_single_za32_vg1x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za32,,_s8,,_vg1x4)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za64_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> 
@llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za64,,_s16,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svdot_single_za64_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svdot_single_za64_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_single_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_single_za64,,_s16,,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi, indexed (unsigned) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// 
CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za32,,_u16,,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za32,,_u16,,_vg1x4)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x2_u8j11svuint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], 
<vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za32,,_u8,,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x4_u8j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za32,,_u8,,_vg1x4)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svdot_lane_za64_vg1x2_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x2_u16j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void 
+// +void test_svdot_lane_za64_vg1x2_u16(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za64,,,_u16,_vg1x2)(slice_base, zn, zm, 1); +} + +// CHECK-LABEL: @test_svdot_lane_za64_vg1x4_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x4_u16j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za64_vg1x4_u16(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za64,,,_u16,_vg1x4)(slice_base, zn, zm, 1); +} + + +// +// Multi, indexed (signed) +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_s16(uint32_t slice_base, 
svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za32,,,_s16,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za32_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za32,,,_s16,_vg1x4)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svdot_lane_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x2_s8j10svint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za { + 
SVE_ACLE_FUNC(svdot_lane_za32,,,_s8,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svdot_lane_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z29test_svdot_lane_za32_vg1x4_s8j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za32,,,_s8,_vg1x4)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svdot_lane_za64_vg1x2_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x2_s16j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za64_vg1x2_s16(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za64,,,_s16,_vg1x2)(slice_base, zn, zm, 1); +} + +// CHECK-LABEL: @test_svdot_lane_za64_vg1x4_s16( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svdot_lane_za64_vg1x4_s16j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svdot_lane_za64_vg1x4_s16(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svdot_lane_za64,,,_s16,_vg1x4)(slice_base, zn, zm, 1); +} + + +// +// Multi, multi (unsigned by signed) +// CHECK-LABEL: @test_svusdot_multi_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svusdot_multi_za32_vg1x2_u8j11svuint8x2_t10svint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 
16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_multi_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svusdot_za32,,,_u8,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svusdot_multi_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svusdot_multi_za32_vg1x4_u8j11svuint8x4_t10svint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]]) +// CPP-CHECK-NEXT: ret void +// +void 
test_svusdot_multi_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svusdot_za32,,,_u8,_vg1x4)(slice_base, zn, zm); +} + + +// +// Multi, single (unsigned by signed) +// CHECK-LABEL: @test_svusdot_single_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svusdot_single_za32_vg1x2_u8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_single_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svusdot_single_za32,,_u8,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svusdot_single_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svusdot_single_za32_vg1x4_u8j11svuint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_single_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) 
__arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svusdot_single_za32,,_u8,,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi, indexed (unsigned by signed) +// CHECK-LABEL: @test_svusdot_lane_za32_vg1x2_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svusdot_lane_za32_vg1x2_u8j11svuint8x2_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_lane_za32_vg1x2_u8(uint32_t slice_base, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svusdot_lane_za32,,_u8,,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svusdot_lane_za32_vg1x4_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svusdot_lane_za32_vg1x4_u8j11svuint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svusdot_lane_za32_vg1x4_u8(uint32_t slice_base, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za { + 
SVE_ACLE_FUNC(svusdot_lane_za32,,_u8,,_vg1x4)(slice_base, zn, zm, 3); +} + + +// +// Multi, single (signed by unsigned) +// CHECK-LABEL: @test_svsudot_single_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svsudot_single_za32_vg1x2_s8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_single_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsudot_single_za32,,_s8,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsudot_single_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z33test_svsudot_single_za32_vg1x4_s8j10svint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_single_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsudot_single_za32,,_s8,,_vg1x4)(slice_base, zn, zm); +} + +// +// 
Multi, multi (signed by unsigned) +// CHECK-LABEL: @test_svsudot_multi_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svsudot_multi_za32_vg1x2_s8j10svint8x2_t11svuint8x2_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_multi_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8x2_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsudot_za32,,_s8,,_vg1x2)(slice_base, zn, zm); +} + +// CHECK-LABEL: @test_svsudot_multi_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: 
@_Z32test_svsudot_multi_za32_vg1x4_s8j10svint8x4_t11svuint8x4_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP5:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 16) +// CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 32) +// CPP-CHECK-NEXT: [[TMP7:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZM]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP4]], <vscale x 16 x i8> [[TMP5]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_multi_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8x4_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsudot_za32,,_s8,,_vg1x4)(slice_base, zn, zm); +} + +// +// Multi, indexed (signed by unsigned) +// CHECK-LABEL: @test_svsudot_lane_za32_vg1x2_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svsudot_lane_za32_vg1x2_s8j10svint8x2_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_lane_za32_vg1x2_s8(uint32_t slice_base, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsudot_lane_za32,,_s8,,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svsudot_lane_za32_vg1x4_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = 
tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svsudot_lane_za32_vg1x4_s8j10svint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svsudot_lane_za32_vg1x4_s8(uint32_t slice_base, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsudot_lane_za32,,_s8,,_vg1x4)(slice_base, zn, zm, 3); +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c new file mode 100644 index 0000000000000..fb313d4cebd72 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c @@ -0,0 +1,222 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s +#include <arm_sme_draft_spec_subject_to_change.h> + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. 
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 +#else +#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 +#endif + +// CHECK-LABEL: @test_svvdot_lane_za32_bf16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svvdot_lane_za32_bf16_vg1x2j14svbfloat16x2_tu14__SVBfloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x bfloat> @llvm.vector.extract.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8bf16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x bfloat> [[TMP0]], <vscale x 8 x bfloat> [[TMP1]], <vscale x 8 x bfloat> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_bf16_vg1x2(uint32_t slice_base, svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svvdot_lane_za32,_bf16,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svvdot_lane_za32_f16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_f16_vg1x2j13svfloat16x2_tu13__SVFloat16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.vector.extract.nxv8f16.nxv16f16(<vscale x 16 x half> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fvdot.lane.za32.vg1x2.nxv8f16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x half> [[TMP0]], <vscale x 8 x half> [[TMP1]], <vscale x 8 x half> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_f16_vg1x2(uint32_t slice_base, svfloat16x2_t zn, svfloat16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svvdot_lane_za32,_f16,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svvdot_lane_za32_s16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CHECK-NEXT: 
ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_s16_vg1x2j11svint16x2_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_s16_vg1x2(uint32_t slice_base, svint16x2_t zn, svint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svvdot_lane_za32,_s16,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svvdot_lane_za32_u16_vg1x2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za32_u16_vg1x2j12svuint16x2_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x2.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_u16_vg1x2(uint32_t slice_base, svuint16x2_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svvdot_lane_za32,_u16,_vg1x2)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svvdot_lane_za32_s8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svvdot_lane_za32_s8_vg1x4j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = 
tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svvdot_lane_za32,_s8,_vg1x4)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svvdot_lane_za32_u8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z30test_svvdot_lane_za32_u8_vg1x4j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svvdot_lane_za32,_u8,_vg1x4)(slice_base, zn, zm, 3); +} + +// CHECK-LABEL: @test_svvdot_lane_za64_s16_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x 
i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za64_s16_vg1x4j11svint16x4_tu11__SVInt16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.svdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za64_s16_vg1x4(uint32_t slice_base, svint16x4_t zn, svint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svvdot_lane_za64,_s16,_vg1x4)(slice_base, zn, zm, 1); +} + +// CHECK-LABEL: @test_svvdot_lane_za64_u16_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z31test_svvdot_lane_za64_u16_vg1x4j12svuint16x4_tu12__SVUint16_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 8) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv32i16(<vscale x 32 x i16> [[ZN]], i64 24) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.uvdot.lane.za64.vg1x4.nxv8i16(i32 [[SLICE_BASE:%.*]], <vscale x 8 x i16> [[TMP0]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[ZM:%.*]], i32 1) +// CPP-CHECK-NEXT: ret void +// +void test_svvdot_lane_za64_u16_vg1x4(uint32_t slice_base, svuint16x4_t zn, svuint16_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svvdot_lane_za64,_u16,_vg1x4)(slice_base, zn, zm, 1); +} + + +// CHECK-LABEL: @test_svsuvdot_lane_za32_s8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// 
CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svsuvdot_lane_za32_s8_vg1x4j10svint8x4_tu10__SVInt8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.suvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svsuvdot_lane_za32_s8_vg1x4(uint32_t slice_base, svint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svsuvdot_lane_za32,_s8,_vg1x4)(slice_base, zn, zm, 3); +} + + +// CHECK-LABEL: @test_svusvdot_lane_za32_u8_vg1x4( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z32test_svusvdot_lane_za32_u8_vg1x4j11svuint8x4_tu11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN:%.*]], i64 0) +// CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 16) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 32) +// CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv64i8(<vscale x 64 x i8> [[ZN]], i64 48) +// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.usvdot.lane.za32.vg1x4.nxv16i8(i32 [[SLICE_BASE:%.*]], <vscale x 16 x i8> [[TMP0]], <vscale x 16 x i8> [[TMP1]], 
<vscale x 16 x i8> [[TMP2]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[ZM:%.*]], i32 3) +// CPP-CHECK-NEXT: ret void +// +void test_svusvdot_lane_za32_u8_vg1x4(uint32_t slice_base, svuint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za { + SVE_ACLE_FUNC(svusvdot_lane_za32,_u8,_vg1x4)(slice_base, zn, zm, 3); +} + diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp index d07d83a53e462..6a6370bf99b10 100644 --- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp @@ -285,3 +285,68 @@ void test_multiply_add_sub_long(uint32_t base, svint8_t s8, svuint8_t u8, svusmla_lane_za32_u8_vg4x2(base, u8x2, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} svusmla_lane_za32_u8_vg4x4(base, u8x4, s8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} } + +void test_vertical_dot_product(uint32_t base, svint16x2_t s16x2, svuint16x2_t u16x2, + svint8x4_t s8x4, svuint8x4_t u8x4, + svint16x4_t s16x4, svuint16x4_t u16x4, + svfloat16x2_t f16x2, svbfloat16x2_t bf16x2, + svint16_t s16, svuint16_t u16, + svint8_t s8, svuint8_t u8, + svfloat16_t f16, svbfloat16_t b16) __arm_streaming __arm_shared_za { + // Test lane indices. + svvdot_lane_za32_s16_vg1x2(base, s16x2, s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_u16_vg1x2(base, u16x2, u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_s8_vg1x4(base, s8x4, s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_u8_vg1x4(base, u8x4, u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za64_s16_vg1x4(base, s16x4, s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svvdot_lane_za64_u16_vg1x4(base, u16x4, u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svvdot_lane_za32_f16_vg1x2(base, f16x2, f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svvdot_lane_za32_bf16_vg1x2(base, bf16x2, b16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svsuvdot_lane_za32_s8_vg1x4(base, s8x4, s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svusvdot_lane_za32_u8_vg1x4(base, u8x4, u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +void test_fdot_za32_bad_lane(uint32_t slice_base, svfloat16_t z_f16, + svfloat16x2_t z_f16x2, svfloat16x4_t z_f16x4, + svbfloat16_t z_bf16, svbfloat16x2_t z_bf16x2, + svbfloat16x4_t z_bf16x4) __arm_streaming __arm_shared_za { + // 16-bit float + svdot_lane_za32_f16_vg1x2(slice_base, z_f16x2, z_f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_f16_vg1x4(slice_base, z_f16x4, z_f16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // 16-bit binary float + svdot_lane_za32_bf16_vg1x2(slice_base, z_bf16x2, z_bf16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_bf16_vg1x4(slice_base, z_bf16x4, z_bf16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} + +void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16, + svuint16x2_t z_u16x2, svuint16x4_t z_u16x4, + svint16_t z_s16, svint16x2_t 
z_s16x2, + svint16x4_t z_s16x4, svuint8_t z_u8, + svuint8x2_t z_u8x2, svuint8x4_t z_u8x4, + svint8_t z_s8, svint8x2_t z_s8x2, + svint8x4_t z_s8x4) __arm_streaming __arm_shared_za { + // Multi, indexed (unsigned) + svdot_lane_za32_u16_vg1x2(slice_base, z_u16x2, z_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_u16_vg1x4(slice_base, z_u16x4, z_u16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_u8_vg1x2(slice_base, z_u8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_u8_vg1x4(slice_base, z_u8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za64_u16_vg1x2(slice_base, z_u16x2, z_u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svdot_lane_za64_u16_vg1x4(slice_base, z_u16x4, z_u16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + + // Multi, indexed (signed) + svdot_lane_za32_s16_vg1x2(slice_base, z_s16x2, z_s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_s16_vg1x4(slice_base, z_s16x4, z_s16, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svdot_lane_za64_s16_vg1x2(slice_base, z_s16x2, z_s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + svdot_lane_za64_s16_vg1x4(slice_base, z_s16x4, z_s16, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + + // Multi, indexed (unsigned by signed) + svusdot_lane_za32_u8_vg1x2(slice_base, z_u8x2, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svusdot_lane_za32_u8_vg1x4(slice_base, z_u8x4, z_s8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + + // Multi, indexed (unsigned by signed) + svsudot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} +} From 74a09bd1ec6066d56880df1ae1a2c0258442cee9 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson <paulson1@linux.ibm.com> Date: Thu, 21 Dec 2023 20:48:00 +0100 Subject: [PATCH 112/342] [SystemZ] Test improvements for atomic load/store instructions (NFC). (#75630) Improve tests for atomic loads and stores, mainly by testing 128-bit atomic load and store instructions both with and w/out natural alignment. 
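For context, here is a minimal sketch (not part of the patch; the function names are illustrative only) of the two flavors the updated tests below exercise. It mirrors the f1/f2 cases in atomic-load-05.ll: per the CHECK lines, the naturally aligned access is expected to stay inline (lpq/stpq), while the under-aligned one is expected to go through the __atomic_* libcalls.

; Compile with the same triple as the RUN lines: llc -mtriple=s390x-linux-gnu
; Naturally aligned 128-bit atomic load: expected to lower to an inline lpq.
define i128 @aligned_load(ptr %src) {
  %val = load atomic i128, ptr %src seq_cst, align 16
  ret i128 %val
}

; Under-aligned 128-bit atomic load: expected to call __atomic_load instead.
define i128 @underaligned_load(ptr %src) {
  %val = load atomic i128, ptr %src seq_cst, align 8
  ret i128 %val
}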
--- llvm/test/CodeGen/SystemZ/atomic-load-05.ll | 18 ++++++++++---- llvm/test/CodeGen/SystemZ/atomic-load-08.ll | 26 ++++++++++++-------- llvm/test/CodeGen/SystemZ/atomic-store-05.ll | 9 ++++++- llvm/test/CodeGen/SystemZ/atomic-store-06.ll | 14 +++++++---- llvm/test/CodeGen/SystemZ/atomic-store-07.ll | 7 ++++-- llvm/test/CodeGen/SystemZ/atomic-store-08.ll | 26 ++++++++++++-------- 6 files changed, 67 insertions(+), 33 deletions(-) diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-05.ll b/llvm/test/CodeGen/SystemZ/atomic-load-05.ll index 979f1e684e89a..f406bc6d2ce6c 100644 --- a/llvm/test/CodeGen/SystemZ/atomic-load-05.ll +++ b/llvm/test/CodeGen/SystemZ/atomic-load-05.ll @@ -1,14 +1,22 @@ -; Test 128-bit atomic loads. +; Test 128-bit integer atomic loads. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s define i128 @f1(ptr %src) { ; CHECK-LABEL: f1: -; CHECK: lpq %r0, 0(%r3) -; CHECK-DAG: stg %r1, 8(%r2) -; CHECK-DAG: stg %r0, 0(%r2) -; CHECK: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: lpq %r0, 0(%r3) +; CHECK-NEXT: stg %r1, 8(%r2) +; CHECK-NEXT: stg %r0, 0(%r2) +; CHECK-NEXT: br %r14 %val = load atomic i128, ptr %src seq_cst, align 16 ret i128 %val } + +define i128 @f2(ptr %src) { +; CHECK-LABEL: f2: +; CHECK: brasl %r14, __atomic_load@PLT + %val = load atomic i128, ptr %src seq_cst, align 8 + ret i128 %val +} diff --git a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll index 069d2168e19af..4d914e3ea0e18 100644 --- a/llvm/test/CodeGen/SystemZ/atomic-load-08.ll +++ b/llvm/test/CodeGen/SystemZ/atomic-load-08.ll @@ -1,19 +1,25 @@ -; Test long double atomic loads. Expect a libcall. +; Test long double atomic loads. These are emitted by the Clang FE as i128 +; loads with a bitcast, and this test case gets converted into that form as +; well by the AtomicExpand pass. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s define void @f1(ptr %ret, ptr %src) { ; CHECK-LABEL: f1: -; CHECK: lgr [[RET:%r[0-9]+]], %r2 -; CHECK: la %r4, 160(%r15) -; CHECK: lghi %r2, 16 -; CHECK: lhi %r5, 5 +; CHECK: # %bb.0: +; CHECK-NEXT: lpq %r0, 0(%r3) +; CHECK-NEXT: stg %r1, 8(%r2) +; CHECK-NEXT: stg %r0, 0(%r2) +; CHECK-NEXT: br %r14 + %val = load atomic fp128, ptr %src seq_cst, align 16 + store fp128 %val, ptr %ret, align 8 + ret void +} + +define void @f2(ptr %ret, ptr %src) { +; CHECK-LABEL: f2: ; CHECK: brasl %r14, __atomic_load@PLT -; CHECK: ld [[FL:%f[0-9]+]], 160(%r15) -; CHECK: ld [[FH:%f[0-9]+]], 168(%r15) -; CHECK: std [[FL]], 0([[RET]]) -; CHECK: std [[FH]], 8([[RET]]) -; CHECK: br %r14 %val = load atomic fp128, ptr %src seq_cst, align 8 store fp128 %val, ptr %ret, align 8 ret void diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-05.ll b/llvm/test/CodeGen/SystemZ/atomic-store-05.ll index dad7d9527b848..e4af7ad57e38a 100644 --- a/llvm/test/CodeGen/SystemZ/atomic-store-05.ll +++ b/llvm/test/CodeGen/SystemZ/atomic-store-05.ll @@ -1,4 +1,4 @@ -; Test 128-bit atomic stores. +; Test 128-bit integer atomic stores. 
; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s @@ -24,3 +24,10 @@ define void @f2(i128 %val, ptr %src) { store atomic i128 %val, ptr %src monotonic, align 16 ret void } + +define void @f3(i128 %val, ptr %src) { +; CHECK-LABEL: f3: +; CHECK: brasl %r14, __atomic_store@PLT + store atomic i128 %val, ptr %src seq_cst, align 8 + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-06.ll b/llvm/test/CodeGen/SystemZ/atomic-store-06.ll index fd39793faefc8..b748bfc767a4d 100644 --- a/llvm/test/CodeGen/SystemZ/atomic-store-06.ll +++ b/llvm/test/CodeGen/SystemZ/atomic-store-06.ll @@ -1,13 +1,17 @@ -; Test float atomic loads. +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; Test float atomic stores. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s define void @f1(ptr %src, float %val) { ; CHECK-LABEL: f1: -; CHECK: lgdr [[R:%r[0-9]+]], %f0 -; CHECK: srlg [[R]], [[R]], 32 -; CHECK: st [[R]], 0(%r2) -; CHECK: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $f0s killed $f0s def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 32 +; CHECK-NEXT: st %r0, 0(%r2) +; CHECK-NEXT: bcr 15, %r0 +; CHECK-NEXT: br %r14 store atomic float %val, ptr %src seq_cst, align 4 ret void } diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-07.ll b/llvm/test/CodeGen/SystemZ/atomic-store-07.ll index c904b738f2c57..11f81ae1e07de 100644 --- a/llvm/test/CodeGen/SystemZ/atomic-store-07.ll +++ b/llvm/test/CodeGen/SystemZ/atomic-store-07.ll @@ -1,11 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; Test double atomic stores. ; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s define void @f1(ptr %dst, double %val) { ; CHECK-LABEL: f1: -; CHECK: std %f0, 0(%r2) -; CHECK: br %r14 +; CHECK: # %bb.0: +; CHECK-NEXT: std %f0, 0(%r2) +; CHECK-NEXT: bcr 15, %r0 +; CHECK-NEXT: br %r14 store atomic double %val, ptr %dst seq_cst, align 8 ret void } diff --git a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll index b33b283e8dbd7..f7f4f4d967dbd 100644 --- a/llvm/test/CodeGen/SystemZ/atomic-store-08.ll +++ b/llvm/test/CodeGen/SystemZ/atomic-store-08.ll @@ -1,19 +1,25 @@ -; Test long double atomic stores. Expect a libcall. +; Test long double atomic stores. The atomic store is converted to i128 by +; the AtomicExpand pass. 
; ; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s define void @f1(ptr %dst, ptr %src) { ; CHECK-LABEL: f1: -; CHECK: ld [[FL:%f[0-9]+]], 0(%r3) -; CHECK: ld [[FH:%f[0-9]+]], 8(%r3) -; CHECK: lgr %r3, %r2 -; CHECK: std [[FL]], 160(%r15) -; CHECK: std [[FH]], 168(%r15) -; CHECK: la %r4, 160(%r15) -; CHECK: lghi %r2, 16 -; CHECK: lhi %r5, 5 +; CHECK: # %bb.0: +; CHECK-NEXT: lg %r1, 8(%r3) +; CHECK-NEXT: lg %r0, 0(%r3) +; CHECK-NEXT: stpq %r0, 0(%r2) +; CHECK-NEXT: bcr 1{{[45]}}, %r0 +; CHECK-NEXT: br %r14 + %val = load fp128, ptr %src, align 8 + store atomic fp128 %val, ptr %dst seq_cst, align 16 + ret void +} + +define void @f2(ptr %dst, ptr %src) { +; CHECK-LABEL: f2: ; CHECK: brasl %r14, __atomic_store@PLT -; CHECK: br %r14 %val = load fp128, ptr %src, align 8 store atomic fp128 %val, ptr %dst seq_cst, align 8 ret void From 1830fadb78be9993cfeeaa7fb6867c3df1a53a8b Mon Sep 17 00:00:00 2001 From: cmtice <cmtice@google.com> Date: Thu, 21 Dec 2023 12:05:36 -0800 Subject: [PATCH 113/342] [LLDB] Fix write permission error in TestGlobalModuleCache.py (#76171) TestGlobalModuleCache.py, a recently added test, tries to update a source file in the build directory, but it assumes the file is writable. In our distributed build and test system, this is not always true, so the test often fails with a write permissions error. This change fixes that by setting the permissions on the file to be writable before attempting to write to it. --- .../global_module_cache/TestGlobalModuleCache.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py b/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py index 6bb22c46efb44..5b6e9e8a588a3 100644 --- a/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py +++ b/lldb/test/API/python_api/global_module_cache/TestGlobalModuleCache.py @@ -26,6 +26,13 @@ def copy_to_main(self, src, dst): # a previous build, so sleep a bit here to ensure that the touch is later. time.sleep(2) try: + # Make sure dst is writeable before trying to write to it. + subprocess.run( + ["chmod", "777", dst], + stdin=None, + capture_output=False, + encoding="utf-8", + ) shutil.copy(src, dst) except: self.fail(f"Could not copy {src} to {dst}") From 528150b90bb54618b5cceddb85601718c2346f3f Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Thu, 21 Dec 2023 12:37:17 -0800 Subject: [PATCH 114/342] [hwasan] Separate sections in report (#76130) It makes them easier to read. 
--- compiler-rt/lib/hwasan/hwasan_report.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index 5e8aa315801bc..bbe89112e4dbe 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -233,7 +233,7 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa, if (obj_offset >= local.size) continue; if (!found_local) { - Printf("Potentially referenced stack objects:\n"); + Printf("\nPotentially referenced stack objects:\n"); found_local = true; } Printf(" %s in %s %s:%d\n", local.name, local.function_name, @@ -363,7 +363,7 @@ static void PrintTagsAroundAddr(uptr addr, GetTag get_tag, InternalScopedString s; addr = MemToShadow(addr); s.AppendF( - "Memory tags around the buggy address (one tag corresponds to %zd " + "\nMemory tags around the buggy address (one tag corresponds to %zd " "bytes):\n", kShadowAlignment); PrintTagInfoAroundAddr(addr, kShadowLines, s, @@ -803,8 +803,10 @@ void BaseReport::PrintAddressDescription() const { } // Print the remaining threads, as an extra information, 1 line per thread. - if (flags()->print_live_threads_info) + if (flags()->print_live_threads_info) { + Printf("\n"); hwasanThreadList().VisitAllLiveThreads([&](Thread *t) { t->Announce(); }); + } if (!num_descriptions_printed) // We exhausted our possibilities. Bail out. @@ -1020,7 +1022,7 @@ void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size, // See the frame breakdown defined in __hwasan_tag_mismatch (from // hwasan_tag_mismatch_{aarch64,riscv64}.S). void ReportRegisters(const uptr *frame, uptr pc) { - Printf("Registers where the failure occurred (pc %p):\n", pc); + Printf("\nRegisters where the failure occurred (pc %p):\n", pc); // We explicitly print a single line (4 registers/line) each iteration to // reduce the amount of logcat error messages printed. Each Printf() will From 0cf3af0c5176cc067bf90b315466a8997498b988 Mon Sep 17 00:00:00 2001 From: Pete Steinfeld <47540744+psteinfeld@users.noreply.github.com> Date: Thu, 21 Dec 2023 13:14:05 -0800 Subject: [PATCH 115/342] =?UTF-8?q?Revert=20"[Flang]=20Allow=20Intrinsic?= =?UTF-8?q?=20simpification=20with=20min/maxloc=20dim=20and=E2=80=A6=20(#7?= =?UTF-8?q?6184)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … scalar result. (#75820)" This reverts commit 701f64790520790f75b1f948a752472d421ddaa3. The commit breaks some uses of the 'maxloc' intrinsic. See PR #75820 --- .../Transforms/SimplifyIntrinsics.cpp | 13 ++-- flang/test/Transforms/simplifyintrinsics.fir | 68 ++----------------- 2 files changed, 13 insertions(+), 68 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp index 12f354a47c2bc..c89ee6d5e2039 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp @@ -1162,14 +1162,11 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( mlir::Operation::operand_range args = call.getArgs(); - mlir::SymbolRefAttr callee = call.getCalleeAttr(); - mlir::StringRef funcNameBase = callee.getLeafReference().getValue(); - bool isDim = funcNameBase.ends_with("Dim"); - mlir::Value back = args[isDim ? 7 : 6]; + mlir::Value back = args[6]; if (isTrueOrNotConstant(back)) return; - mlir::Value mask = args[isDim ? 
6 : 5]; + mlir::Value mask = args[5]; mlir::Value maskDef = findMaskDef(mask); // maskDef is set to NULL when the defining op is not one we accept. @@ -1178,8 +1175,10 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( if (maskDef == NULL) return; + mlir::SymbolRefAttr callee = call.getCalleeAttr(); + mlir::StringRef funcNameBase = callee.getLeafReference().getValue(); unsigned rank = getDimCount(args[1]); - if ((isDim && rank != 1) || !(rank > 0)) + if (funcNameBase.ends_with("Dim") || !(rank > 0)) return; fir::FirOpBuilder builder{getSimplificationBuilder(call, kindMap)}; @@ -1235,7 +1234,7 @@ void SimplifyIntrinsicsPass::simplifyMinMaxlocReduction( mlir::func::FuncOp newFunc = getOrCreateFunction(builder, funcName, typeGenerator, bodyGenerator); builder.create<fir::CallOp>(loc, newFunc, - mlir::ValueRange{args[0], args[1], mask}); + mlir::ValueRange{args[0], args[1], args[5]}); call->dropAllReferences(); call->erase(); } diff --git a/flang/test/Transforms/simplifyintrinsics.fir b/flang/test/Transforms/simplifyintrinsics.fir index d42924a17a804..0bd6ac7c436ff 100644 --- a/flang/test/Transforms/simplifyintrinsics.fir +++ b/flang/test/Transforms/simplifyintrinsics.fir @@ -2115,13 +2115,13 @@ func.func @_QPtestminloc_doesntwork1d_back(%arg0: !fir.ref<!fir.array<10xi32>> { // CHECK-NOT: fir.call @_FortranAMinlocInteger4x1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> () // ----- -// Check Minloc is simplified when DIM arg is set so long as the result is scalar +// Check Minloc is not simplified when DIM arg is set -func.func @_QPtestminloc_1d_dim(%arg0: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { +func.func @_QPtestminloc_doesntwork1d_dim(%arg0: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { %0 = fir.alloca !fir.box<!fir.heap<i32>> %c10 = arith.constant 10 : index %c1 = arith.constant 1 : index - %1 = fir.alloca !fir.array<1xi32> {bindc_name = "testminloc_1d_dim", uniq_name = "_QFtestminloc_1d_dimEtestminloc_1d_dim"} + %1 = fir.alloca !fir.array<1xi32> {bindc_name = "testminloc_doesntwork1d_dim", uniq_name = "_QFtestminloc_doesntwork1d_dimEtestminloc_doesntwork1d_dim"} %2 = fir.shape %c1 : (index) -> !fir.shape<1> %3 = fir.array_load %1(%2) : (!fir.ref<!fir.array<1xi32>>, !fir.shape<1>) -> !fir.array<1xi32> %4 = fir.shape %c10 : (index) -> !fir.shape<1> @@ -2156,65 +2156,11 @@ func.func @_QPtestminloc_1d_dim(%arg0: !fir.ref<!fir.array<10xi32>> {fir.bindc_n %21 = fir.load %1 : !fir.ref<!fir.array<1xi32>> return %21 : !fir.array<1xi32> } -// CHECK-LABEL: func.func @_QPtestminloc_1d_dim( +// CHECK-LABEL: func.func @_QPtestminloc_doesntwork1d_dim( // CHECK-SAME: %[[ARR:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> !fir.array<1xi32> { -// CHECK: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> () - -// CHECK-LABEL: func.func private @_FortranAMinlocDimx1_i32_contract_simplified(%arg0: !fir.ref<!fir.box<none>>, %arg1: !fir.box<none>, %arg2: !fir.box<none>) attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} { -// CHECK-NEXT: %[[V0:.*]] = fir.alloca i32 -// CHECK-NEXT: %c0_i32 = arith.constant 0 : i32 -// CHECK-NEXT: %c1 = arith.constant 1 : index -// CHECK-NEXT: %[[V1:.*]] = fir.allocmem !fir.array<1xi32> -// CHECK-NEXT: %[[V2:.*]] = fir.shape %c1 : (index) -> !fir.shape<1> -// CHECK-NEXT: %[[V3:.*]] = fir.embox %[[V1]](%[[V2]]) : 
(!fir.heap<!fir.array<1xi32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<1xi32>>> -// CHECK-NEXT: %c0 = arith.constant 0 : index -// CHECK-NEXT: %[[V4:.*]] = fir.coordinate_of %[[V3]], %c0 : (!fir.box<!fir.heap<!fir.array<1xi32>>>, index) -> !fir.ref<i32> -// CHECK-NEXT: fir.store %c0_i32 to %[[V4]] : !fir.ref<i32> -// CHECK-NEXT: %c0_0 = arith.constant 0 : index -// CHECK-NEXT: %[[V5:.*]] = fir.convert %arg1 : (!fir.box<none>) -> !fir.box<!fir.array<?xi32>> -// CHECK-NEXT: %c1_i32 = arith.constant 1 : i32 -// CHECK-NEXT: %c0_i32_1 = arith.constant 0 : i32 -// CHECK-NEXT: fir.store %c0_i32_1 to %[[V0]] : !fir.ref<i32> -// CHECK-NEXT: %c2147483647_i32 = arith.constant 2147483647 : i32 -// CHECK-NEXT: %c1_2 = arith.constant 1 : index -// CHECK-NEXT: %c0_3 = arith.constant 0 : index -// CHECK-NEXT: %[[V6:.*]]:3 = fir.box_dims %[[V5]], %c0_3 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index) -// CHECK-NEXT: %[[V7:.*]] = arith.subi %[[V6]]#1, %c1_2 : index -// CHECK-NEXT: %[[V8:.*]] = fir.do_loop %arg3 = %c0_0 to %[[V7]] step %c1_2 iter_args(%arg4 = %c2147483647_i32) -> (i32) { -// CHECK-NEXT: fir.store %c1_i32 to %[[V0]] : !fir.ref<i32> -// CHECK-NEXT: %[[V12:.*]] = fir.coordinate_of %[[V5]], %arg3 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32> -// CHECK-NEXT: %[[V13:.*]] = fir.load %[[V12]] : !fir.ref<i32> -// CHECK-NEXT: %[[V14:.*]] = arith.cmpi slt, %[[V13]], %arg4 : i32 -// CHECK-NEXT: %[[V15:.*]] = fir.if %[[V14]] -> (i32) { -// CHECK-NEXT: %c1_i32_4 = arith.constant 1 : i32 -// CHECK-NEXT: %c0_5 = arith.constant 0 : index -// CHECK-NEXT: %[[V16:.*]] = fir.coordinate_of %[[V3]], %c0_5 : (!fir.box<!fir.heap<!fir.array<1xi32>>>, index) -> !fir.ref<i32> -// CHECK-NEXT: %[[V17:.*]] = fir.convert %arg3 : (index) -> i32 -// CHECK-NEXT: %[[V18:.*]] = arith.addi %[[V17]], %c1_i32_4 : i32 -// CHECK-NEXT: fir.store %[[V18]] to %[[V16]] : !fir.ref<i32> -// CHECK-NEXT: fir.result %[[V13]] : i32 -// CHECK-NEXT: } else { -// CHECK-NEXT: fir.result %arg4 : i32 -// CHECK-NEXT: } -// CHECK-NEXT: fir.result %[[V15]] : i32 -// CHECK-NEXT: } -// CHECK-NEXT: %[[V9:.*]] = fir.load %[[V0]] : !fir.ref<i32> -// CHECK-NEXT: %[[V10:.*]] = arith.cmpi eq, %[[V9]], %c1_i32 : i32 -// CHECK-NEXT: fir.if %[[V10]] { -// CHECK-NEXT: %c2147483647_i32_4 = arith.constant 2147483647 : i32 -// CHECK-NEXT: %[[V12]] = arith.cmpi eq, %c2147483647_i32_4, %[[V8]] : i32 -// CHECK-NEXT: fir.if %[[V12]] { -// CHECK-NEXT: %c0_5 = arith.constant 0 : index -// CHECK-NEXT: %[[V13]] = fir.coordinate_of %[[V3]], %c0_5 : (!fir.box<!fir.heap<!fir.array<1xi32>>>, index) -> !fir.ref<i32> -// CHECK-NEXT: fir.store %c1_i32 to %[[V13]] : !fir.ref<i32> -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: %[[V11:.*]] = fir.convert %arg0 : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<1xi32>>>> -// CHECK-NEXT: fir.store %[[V3]] to %[[V11]] : !fir.ref<!fir.box<!fir.heap<!fir.array<1xi32>>>> -// CHECK-NEXT: return -// CHECK-NEXT: } - - +// CHECK-NOT: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> () +// CHECK: fir.call @_FortranAMinlocDim({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, i32, !fir.ref<i8>, i32, !fir.box<none>, i1) -> none +// CHECK-NOT: fir.call @_FortranAMinlocDimx1_i32_contract_simplified({{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>) -> () // ----- // Check Minloc is not simplified when dimension of inputArr is unknown 
From 157748341358f38ab55ea3a7a64276a5d4431d77 Mon Sep 17 00:00:00 2001 From: ShatianWang <38512325+ShatianWang@users.noreply.github.com> Date: Thu, 21 Dec 2023 16:17:10 -0500 Subject: [PATCH 116/342] [BOLT] Don't split likely fallthrough in CDSplit (#76164) This diff speeds up CDSplit by not considering any hot-warm splitting point that could break a fall-through branch from a basic block to its most likely successor. Co-authored-by: spupyrev <spupyrev@fb.com> --- bolt/lib/Passes/SplitFunctions.cpp | 100 ++++++++++++++++++----------- bolt/test/X86/cdsplit-call-scale.s | 9 ++- 2 files changed, 68 insertions(+), 41 deletions(-) diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp index d9f5da3e3bcab..5de0759730048 100644 --- a/bolt/lib/Passes/SplitFunctions.cpp +++ b/bolt/lib/Passes/SplitFunctions.cpp @@ -175,8 +175,12 @@ struct SplitCacheDirected final : public SplitStrategy { void fragment(const BlockIt Start, const BlockIt End) override { BasicBlockOrder BlockOrder(Start, End); BinaryFunction &BF = *BlockOrder.front()->getFunction(); + // No need to re-split small functions. + if (BlockOrder.size() <= 2) + return; size_t BestSplitIndex = findSplitIndex(BF, BlockOrder); + assert(BestSplitIndex < BlockOrder.size()); // Assign fragments based on the computed best split index. // All basic blocks with index up to the best split index become hot. @@ -200,10 +204,12 @@ struct SplitCacheDirected final : public SplitStrategy { }; struct SplitScore { - size_t SplitIndex; + size_t SplitIndex = size_t(-1); size_t HotSizeReduction = 0; double LocalScore = 0; double CoverCallScore = 0; + + double sum() const { return LocalScore + CoverCallScore; } }; // Auxiliary variables used by the algorithm. @@ -303,7 +309,7 @@ struct SplitCacheDirected final : public SplitStrategy { const size_t SplitIndex) { assert(SplitIndex < BlockOrder.size() && "Invalid split index"); - // Update function layout assuming hot-warm splitting at SplitIndex + // Update function layout assuming hot-warm splitting at SplitIndex. for (size_t Index = 0; Index < BlockOrder.size(); Index++) { BinaryBasicBlock *BB = BlockOrder[Index]; if (BB->getFragmentNum() == FragmentNum::cold()) @@ -319,8 +325,8 @@ struct SplitCacheDirected final : public SplitStrategy { // Populate BB.OutputAddressRange with estimated new start and end addresses // and compute the old end address of the hot section and the new end // address of the hot section. - size_t OldHotEndAddr; - size_t NewHotEndAddr; + size_t OldHotEndAddr{0}; + size_t NewHotEndAddr{0}; size_t CurrentAddr = BBOffsets[BlockOrder[0]]; for (BinaryBasicBlock *BB : BlockOrder) { // We only care about new addresses of blocks in hot/warm. @@ -492,20 +498,15 @@ struct SplitCacheDirected final : public SplitStrategy { } /// Compute the split score of splitting a function at a given index. - /// The split score consists of local score and cover score. Cover call score - /// is expensive to compute. As a result, we pass in a \p ReferenceScore and - /// compute cover score only when the local score exceeds that in the - /// ReferenceScore or that the size reduction of the hot fragment is larger - /// than that achieved by the split index of the ReferenceScore. This function - /// returns \p Score of SplitScore type. It contains the local score and cover - /// score (if computed) of the current splitting index. For easier book - /// keeping and comparison, it also stores the split index and the resulting - /// reduction in hot fragment size. 
+ /// The split score consists of local score and cover score. This function + /// returns \p Score of SplitScore type. It contains the local score and + /// cover score of the current splitting index. For easier book keeping and + /// comparison, it also stores the split index and the resulting reduction + /// in hot fragment size. SplitScore computeSplitScore(const BinaryFunction &BF, const BasicBlockOrder &BlockOrder, const size_t SplitIndex, - const std::vector<CallInfo> &CoverCalls, - const SplitScore &ReferenceScore) { + const std::vector<CallInfo> &CoverCalls) { // Populate BinaryBasicBlock::OutputAddressRange with estimated // new start and end addresses after hot-warm splitting at SplitIndex. size_t OldHotEnd; @@ -533,47 +534,74 @@ struct SplitCacheDirected final : public SplitStrategy { // increamented in place. computeJumpScore(BlockOrder, SplitIndex, Score); - // There is no need to compute CoverCallScore if we have already found - // another split index with a bigger LocalScore and bigger HotSizeReduction. - if (Score.LocalScore <= ReferenceScore.LocalScore && - Score.HotSizeReduction <= ReferenceScore.HotSizeReduction) - return Score; - // Compute CoverCallScore and store in Score in place. computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score); return Score; } + /// Find the most likely successor of a basic block when it has one or two + /// successors. Return nullptr otherwise. + const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) { + if (BB->succ_size() == 1) + return BB->getSuccessor(); + if (BB->succ_size() == 2) { + uint64_t TakenCount = BB->getTakenBranchInfo().Count; + assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE); + uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count; + assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE); + if (TakenCount > NonTakenCount) + return BB->getConditionalSuccessor(true); + else if (TakenCount < NonTakenCount) + return BB->getConditionalSuccessor(false); + } + return nullptr; + } + /// Find the best index for splitting. The returned value is the index of the /// last hot basic block. Hence, "no splitting" is equivalent to returning the /// value which is one less than the size of the function. size_t findSplitIndex(const BinaryFunction &BF, const BasicBlockOrder &BlockOrder) { + assert(BlockOrder.size() > 2); // Find all function calls that can be shortened if we move blocks of the // current function to warm/cold const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF); - // Try all possible split indices (blocks with Index <= SplitIndex are in - // hot) and find the one maximizing the splitting score. + // Find the existing hot-cold splitting index. + size_t HotColdIndex = 0; + while (HotColdIndex + 1 < BlockOrder.size()) { + if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold()) + break; + HotColdIndex++; + } + assert(HotColdIndex + 1 == BlockOrder.size() || + (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() && + BlockOrder[HotColdIndex + 1]->getFragmentNum() == + FragmentNum::cold())); + + // Try all possible split indices up to HotColdIndex (blocks that have + // Index <= SplitIndex are in hot) and find the one maximizing the + // splitting score. 
SplitScore BestScore; - double BestScoreSum = -1.0; - SplitScore ReferenceScore; - for (size_t Index = 0; Index < BlockOrder.size(); Index++) { + for (size_t Index = 0; Index <= HotColdIndex; Index++) { const BinaryBasicBlock *LastHotBB = BlockOrder[Index]; - // No need to keep cold blocks in the hot section. - if (LastHotBB->getFragmentNum() == FragmentNum::cold()) - break; + assert(LastHotBB->getFragmentNum() != FragmentNum::cold()); + + // Do not break jump to the most likely successor. + if (Index + 1 < BlockOrder.size() && + BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB)) + continue; + const SplitScore Score = - computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore); - double ScoreSum = Score.LocalScore + Score.CoverCallScore; - if (ScoreSum > BestScoreSum) { - BestScoreSum = ScoreSum; + computeSplitScore(BF, BlockOrder, Index, CoverCalls); + if (Score.sum() > BestScore.sum()) BestScore = Score; - } - if (Score.LocalScore > ReferenceScore.LocalScore) - ReferenceScore = Score; } + // If we don't find a good splitting point, fallback to the original one. + if (BestScore.SplitIndex == size_t(-1)) + return HotColdIndex; + return BestScore.SplitIndex; } }; diff --git a/bolt/test/X86/cdsplit-call-scale.s b/bolt/test/X86/cdsplit-call-scale.s index 5b4f92832624c..5701d9e6dfd69 100644 --- a/bolt/test/X86/cdsplit-call-scale.s +++ b/bolt/test/X86/cdsplit-call-scale.s @@ -2,8 +2,9 @@ # When -call-scale=0.0, the tested function is 2-way splitted. # When -call-scale=1.0, the tested function is 3-way splitted with 5 blocks # in warm because of the increased benefit of shortening the call edges. -# When -call-scale=1000.0, the tested function is 3-way splitted with 7 blocks -# in warm because of the strong benefit of shortening the call edges. +# When -call-scale=1000.0, the tested function is still 3-way splitted with +# 5 blocks in warm because cdsplit does not allow hot-warm splitting to break +# a fall through branch from a basic block to its most likely successor. 
# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o # RUN: link_fdata %s %t.o %t.fdata @@ -39,12 +40,10 @@ # MEDINCENTIVE: {{^\.Ltmp5}} # HIGHINCENTIVE: Binary Function "chain" after split-functions -# HIGHINCENTIVE: {{^\.LBB00}} +# HIGHINCENTIVE: {{^\.Ltmp1}} # HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT ------- # HIGHINCENTIVE: {{^\.LFT1}} # HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT ------- -# HIGHINCENTIVE: {{^\.LFT0}} -# HIGHINCENTIVE: {{^\.Ltmp1}} # HIGHINCENTIVE: {{^\.Ltmp0}} # HIGHINCENTIVE: {{^\.Ltmp2}} # HIGHINCENTIVE: {{^\.Ltmp3}} From 011024536963c7822c81f33434969e8eff08e180 Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Thu, 21 Dec 2023 13:49:31 -0800 Subject: [PATCH 117/342] [test][hwasan] Update tests missed by #76130 --- compiler-rt/test/hwasan/TestCases/stack-uar.c | 4 ++-- compiler-rt/test/hwasan/TestCases/stack-uas.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler-rt/test/hwasan/TestCases/stack-uar.c b/compiler-rt/test/hwasan/TestCases/stack-uar.c index 48440a47d5f5f..29941e617ad67 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-uar.c +++ b/compiler-rt/test/hwasan/TestCases/stack-uar.c @@ -51,14 +51,14 @@ int main() { // CHECK: is located in stack of thread // CHECK: Potentially referenced stack objects: // CHECK-NEXT: {{zzz|yyy}} in buggy {{.*}}stack-uar.c: - // CHECK-NEXT: Memory tags around the buggy address + // CHECK: Memory tags around the buggy address // NOSYM: Previously allocated frames: // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uar.c.tmp+0x{{.*}}){{$}} // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uar.c.tmp+0x{{.*}}){{$}} // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uar.c.tmp+0x{{.*}}){{$}} // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uar.c.tmp+0x{{.*}}){{$}} - // NOSYM-NEXT: Memory tags around the buggy address + // NOSYM: Memory tags around the buggy address // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in main } diff --git a/compiler-rt/test/hwasan/TestCases/stack-uas.c b/compiler-rt/test/hwasan/TestCases/stack-uas.c index 4455e59100747..d38eedb87fc26 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-uas.c +++ b/compiler-rt/test/hwasan/TestCases/stack-uas.c @@ -70,14 +70,14 @@ int main() { // CHECK: is located in stack of thread // CHECK: Potentially referenced stack objects: // CHECK-NEXT: {{zzz|yyy}} in buggy {{.*}}stack-uas.c: - // CHECK-NEXT: Memory tags around the buggy address + // CHECK: Memory tags around the buggy address // NOSYM: Previously allocated frames: // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}} // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}} // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}} // NOSYM-NEXT: record_addr:0x{{.*}} record:0x{{.*}} ({{.*}}/stack-uas.c.tmp+0x{{.*}}){{$}} - // NOSYM-NEXT: Memory tags around the buggy address + // NOSYM: Memory tags around the buggy address // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in buggy } From 35a5df2de6bd56c95edcd10d6acab040b251238e Mon Sep 17 00:00:00 2001 From: Derek Schuff <dschuff@chromium.org> Date: Thu, 21 Dec 2023 14:16:37 -0800 Subject: [PATCH 118/342] [WebAssembly][Object] Record section start offsets at start of payload (#76188) LLVM ObjectFile currently records the start offsets of sections as the start of the section header, whereas most other tools 
(WABT, emscripten, wasm-tools) record it as the start of the section content, after the header. This affects binutils tools such as objdump and nm, but not compilation/assembly (since that is driven by symbols and assembler labels which already have their values inside the section payload rather in the header. This patch updates LLVM to match the other tools. --- llvm/lib/Object/WasmObjectFile.cpp | 2 +- llvm/test/MC/WebAssembly/custom-sections.ll | 6 +-- llvm/test/MC/WebAssembly/debug-info.ll | 44 +++++++++--------- llvm/test/MC/WebAssembly/debug-info64.ll | 46 +++++++++---------- llvm/test/MC/WebAssembly/tag-section.ll | 2 +- .../test/tools/llvm-readobj/wasm/globals.test | 2 +- .../tools/llvm-readobj/wasm/sections.test | 14 +++--- 7 files changed, 58 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index 05bd7302c4971..dfe86a45df322 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -265,7 +265,6 @@ static wasm::WasmTableType readTableType(WasmObjectFile::ReadContext &Ctx) { static Error readSection(WasmSection &Section, WasmObjectFile::ReadContext &Ctx, WasmSectionOrderChecker &Checker) { - Section.Offset = Ctx.Ptr - Ctx.Start; Section.Type = readUint8(Ctx); LLVM_DEBUG(dbgs() << "readSection type=" << Section.Type << "\n"); // When reading the section's size, store the size of the LEB used to encode @@ -273,6 +272,7 @@ static Error readSection(WasmSection &Section, WasmObjectFile::ReadContext &Ctx, const uint8_t *PreSizePtr = Ctx.Ptr; uint32_t Size = readVaruint32(Ctx); Section.HeaderSecSizeEncodingLen = Ctx.Ptr - PreSizePtr; + Section.Offset = Ctx.Ptr - Ctx.Start; if (Size == 0) return make_error<StringError>("zero length section", object_error::parse_failed); diff --git a/llvm/test/MC/WebAssembly/custom-sections.ll b/llvm/test/MC/WebAssembly/custom-sections.ll index cf2d7098ae4b1..ba669716c934d 100644 --- a/llvm/test/MC/WebAssembly/custom-sections.ll +++ b/llvm/test/MC/WebAssembly/custom-sections.ll @@ -15,18 +15,18 @@ target triple = "wasm32-unknown-unknown" ; CHECK: Section { ; CHECK: Type: CUSTOM (0x0) ; CHECK: Size: 3 -; CHECK: Offset: 38 +; CHECK: Offset: 44 ; CHECK: Name: red ; CHECK: } ; CHECK: Section { ; CHECK: Type: CUSTOM (0x0) ; CHECK: Size: 6 -; CHECK: Offset: 51 +; CHECK: Offset: 57 ; CHECK: Name: green ; CHECK: } ; CHECK: Section { ; CHECK: Type: CUSTOM (0x0) ; CHECK: Size: 25 -; CHECK: Offset: 84 +; CHECK: Offset: 90 ; CHECK: Name: producers ; CHECK: } diff --git a/llvm/test/MC/WebAssembly/debug-info.ll b/llvm/test/MC/WebAssembly/debug-info.ll index c8ab7a93165fb..a65ce0ee83920 100644 --- a/llvm/test/MC/WebAssembly/debug-info.ll +++ b/llvm/test/MC/WebAssembly/debug-info.ll @@ -7,37 +7,37 @@ ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: TYPE (0x1) ; CHECK-NEXT: Size: 4 -; CHECK-NEXT: Offset: 8 +; CHECK-NEXT: Offset: 14 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: IMPORT (0x2) ; CHECK-NEXT: Size: 81 -; CHECK-NEXT: Offset: 18 +; CHECK-NEXT: Offset: 24 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: FUNCTION (0x3) ; CHECK-NEXT: Size: 2 -; CHECK-NEXT: Offset: 105 +; CHECK-NEXT: Offset: 111 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: ELEM (0x9) ; CHECK-NEXT: Size: 7 -; CHECK-NEXT: Offset: 113 +; CHECK-NEXT: Offset: 119 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: DATACOUNT (0xC) ; CHECK-NEXT: Size: 1 -; CHECK-NEXT: Offset: 126 +; CHECK-NEXT: Offset: 132 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: 
CODE (0xA) ; CHECK-NEXT: Size: 4 -; CHECK-NEXT: Offset: 133 +; CHECK-NEXT: Offset: 139 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: DATA (0xB) ; CHECK-NEXT: Size: 19 -; CHECK-NEXT: Offset: 143 +; CHECK-NEXT: Offset: 149 ; CHECK-NEXT: Segments [ ; CHECK-NEXT: Segment { ; CHECK-NEXT: Name: .data.foo @@ -54,91 +54,91 @@ ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 86 -; CHECK-NEXT: Offset: 168 +; CHECK-NEXT: Offset: 174 ; CHECK-NEXT: Name: .debug_abbrev ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 114 -; CHECK-NEXT: Offset: 274 +; CHECK-NEXT: Offset: 280 ; CHECK-NEXT: Name: .debug_info ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 48 -; CHECK-NEXT: Offset: 406 +; CHECK-NEXT: Offset: 412 ; CHECK-NEXT: Name: .debug_aranges ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 121 -; CHECK-NEXT: Offset: 475 +; CHECK-NEXT: Offset: 481 ; CHECK-NEXT: Name: .debug_str ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 42 -; CHECK-NEXT: Offset: 613 +; CHECK-NEXT: Offset: 619 ; CHECK-NEXT: Name: .debug_pubnames ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 26 -; CHECK-NEXT: Offset: 677 +; CHECK-NEXT: Offset: 683 ; CHECK-NEXT: Name: .debug_pubtypes ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 56 -; CHECK-NEXT: Offset: 725 +; CHECK-NEXT: Offset: 731 ; CHECK-NEXT: Name: .debug_line ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 91 -; CHECK-NEXT: Offset: 799 +; CHECK-NEXT: Offset: 805 ; CHECK-NEXT: Name: linking ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 9 -; CHECK-NEXT: Offset: 904 +; CHECK-NEXT: Offset: 910 ; CHECK-NEXT: Name: reloc.DATA ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 61 -; CHECK-NEXT: Offset: 930 +; CHECK-NEXT: Offset: 936 ; CHECK-NEXT: Name: reloc..debug_info ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 18 -; CHECK-NEXT: Offset: 1015 +; CHECK-NEXT: Offset: 1021 ; CHECK-NEXT: Name: reloc..debug_aranges ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 6 -; CHECK-NEXT: Offset: 1060 +; CHECK-NEXT: Offset: 1066 ; CHECK-NEXT: Name: reloc..debug_pubnames ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 6 -; CHECK-NEXT: Offset: 1094 +; CHECK-NEXT: Offset: 1100 ; CHECK-NEXT: Name: reloc..debug_pubtypes ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 6 -; CHECK-NEXT: Offset: 1128 +; CHECK-NEXT: Offset: 1134 ; CHECK-NEXT: Name: reloc..debug_line ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 77 -; CHECK-NEXT: Offset: 1158 +; CHECK-NEXT: Offset: 1164 ; CHECK-NEXT: Name: producers ; CHECK-NEXT: } ; CHECK-NEXT:] diff --git a/llvm/test/MC/WebAssembly/debug-info64.ll b/llvm/test/MC/WebAssembly/debug-info64.ll index a63200c908b77..d0081164d73ee 100644 --- a/llvm/test/MC/WebAssembly/debug-info64.ll +++ b/llvm/test/MC/WebAssembly/debug-info64.ll @@ -7,37 +7,37 @@ ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: TYPE (0x1) ; CHECK-NEXT: Size: 4 -; CHECK-NEXT: Offset: 8 +; CHECK-NEXT: Offset: 14 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; 
CHECK-NEXT: Type: IMPORT (0x2) ; CHECK-NEXT: Size: 81 -; CHECK-NEXT: Offset: 18 +; CHECK-NEXT: Offset: 24 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: FUNCTION (0x3) ; CHECK-NEXT: Size: 2 -; CHECK-NEXT: Offset: 105 +; CHECK-NEXT: Offset: 111 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: ELEM (0x9) ; CHECK-NEXT: Size: 7 -; CHECK-NEXT: Offset: 113 +; CHECK-NEXT: Offset: 119 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: DATACOUNT (0xC) ; CHECK-NEXT: Size: 1 -; CHECK-NEXT: Offset: 126 +; CHECK-NEXT: Offset: 132 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CODE (0xA) ; CHECK-NEXT: Size: 4 -; CHECK-NEXT: Offset: 133 +; CHECK-NEXT: Offset: 139 ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: DATA (0xB) ; CHECK-NEXT: Size: 27 -; CHECK-NEXT: Offset: 143 +; CHECK-NEXT: Offset: 149 ; CHECK-NEXT: Segments [ ; CHECK-NEXT: Segment { ; CHECK-NEXT: Name: .data.foo @@ -54,97 +54,97 @@ ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 86 -; CHECK-NEXT: Offset: 176 +; CHECK-NEXT: Offset: 182 ; CHECK-NEXT: Name: .debug_abbrev ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 130 -; CHECK-NEXT: Offset: 282 +; CHECK-NEXT: Offset: 288 ; CHECK-NEXT: Name: .debug_info ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 80 -; CHECK-NEXT: Offset: 430 +; CHECK-NEXT: Offset: 436 ; CHECK-NEXT: Name: .debug_aranges ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 121 -; CHECK-NEXT: Offset: 531 +; CHECK-NEXT: Offset: 537 ; CHECK-NEXT: Name: .debug_str ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 42 -; CHECK-NEXT: Offset: 669 +; CHECK-NEXT: Offset: 675 ; CHECK-NEXT: Name: .debug_pubnames ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 26 -; CHECK-NEXT: Offset: 733 +; CHECK-NEXT: Offset: 739 ; CHECK-NEXT: Name: .debug_pubtypes ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 60 -; CHECK-NEXT: Offset: 781 +; CHECK-NEXT: Offset: 787 ; CHECK-NEXT: Name: .debug_line ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 91 -; CHECK-NEXT: Offset: 859 +; CHECK-NEXT: Offset: 865 ; CHECK-NEXT: Name: linking ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 9 -; CHECK-NEXT: Offset: 964 +; CHECK-NEXT: Offset: 970 ; CHECK-NEXT: Name: reloc.DATA ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 61 -; CHECK-NEXT: Offset: 990 +; CHECK-NEXT: Offset: 996 ; CHECK-NEXT: Name: reloc..debug_info ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 18 -; CHECK-NEXT: Offset: 1075 +; CHECK-NEXT: Offset: 1081 ; CHECK-NEXT: Name: reloc..debug_aranges ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 6 -; CHECK-NEXT: Offset: 1120 +; CHECK-NEXT: Offset: 1126 ; CHECK-NEXT: Name: reloc..debug_pubnames ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 6 -; CHECK-NEXT: Offset: 1154 +; CHECK-NEXT: Offset: 1160 ; CHECK-NEXT: Name: reloc..debug_pubtypes ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 6 -; CHECK-NEXT: Offset: 1188 +; CHECK-NEXT: Offset: 1194 ; CHECK-NEXT: Name: reloc..debug_line ; CHECK-NEXT: } ; 
CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 77 -; CHECK-NEXT: Offset: 1218 +; CHECK-NEXT: Offset: 1224 ; CHECK-NEXT: Name: producers ; CHECK-NEXT: } ; CHECK-NEXT: Section { ; CHECK-NEXT: Type: CUSTOM (0x0) ; CHECK-NEXT: Size: 11 -; CHECK-NEXT: Offset: 1311 +; CHECK-NEXT: Offset: 1317 ; CHECK-NEXT: Name: target_features ; CHECK-NEXT: } ; CHECK-NEXT: ] diff --git a/llvm/test/MC/WebAssembly/tag-section.ll b/llvm/test/MC/WebAssembly/tag-section.ll index c40e132e36f04..56738ec80c8dc 100644 --- a/llvm/test/MC/WebAssembly/tag-section.ll +++ b/llvm/test/MC/WebAssembly/tag-section.ll @@ -53,4 +53,4 @@ define i32 @test_throw1(ptr %p) { ; SEC: Type: TAG (0xD) ; SEC-NEXT: Size: 3 -; SEC-NEXT: Offset: 63 +; SEC-NEXT: Offset: 69 diff --git a/llvm/test/tools/llvm-readobj/wasm/globals.test b/llvm/test/tools/llvm-readobj/wasm/globals.test index 4e9f6403c267c..0dff18dc5d962 100644 --- a/llvm/test/tools/llvm-readobj/wasm/globals.test +++ b/llvm/test/tools/llvm-readobj/wasm/globals.test @@ -19,7 +19,7 @@ Sections: # CHECK: Section { # CHECK-NEXT: Type: DATA (0xB) # CHECK-NEXT: Size: 7 -# CHECK-NEXT: Offset: 8 +# CHECK-NEXT: Offset: 14 # CHECK-NEXT: Segments [ # CHECK-NEXT: Segment { # CHECK-NEXT: Size: 1 diff --git a/llvm/test/tools/llvm-readobj/wasm/sections.test b/llvm/test/tools/llvm-readobj/wasm/sections.test index 8b8a526295eb2..1a4aadad392f3 100644 --- a/llvm/test/tools/llvm-readobj/wasm/sections.test +++ b/llvm/test/tools/llvm-readobj/wasm/sections.test @@ -6,27 +6,27 @@ # CHECK-NEXT: Section { # CHECK-NEXT: Type: TYPE (0x1) # CHECK-NEXT: Size: 17 -# CHECK-NEXT: Offset: 8 +# CHECK-NEXT: Offset: 14 # CHECK-NEXT: } # CHECK-NEXT: Section { # CHECK-NEXT: Type: IMPORT (0x2) # CHECK-NEXT: Size: 93 -# CHECK-NEXT: Offset: 31 +# CHECK-NEXT: Offset: 37 # CHECK-NEXT: } # CHECK-NEXT: Section { # CHECK-NEXT: Type: FUNCTION (0x3) # CHECK-NEXT: Size: 3 -# CHECK-NEXT: Offset: 130 +# CHECK-NEXT: Offset: 136 # CHECK-NEXT: } # CHECK-NEXT: Section { # CHECK-NEXT: Type: CODE (0xA) # CHECK-NEXT: Size: 36 -# CHECK-NEXT: Offset: 139 +# CHECK-NEXT: Offset: 145 # CHECK-NEXT: } # CHECK-NEXT: Section { # CHECK-NEXT: Type: DATA (0xB) # CHECK-NEXT: Size: 19 -# CHECK-NEXT: Offset: 181 +# CHECK-NEXT: Offset: 187 # CHECK-NEXT: Segments [ # CHECK-NEXT: Segment { # CHECK-NEXT: Name: .rodata..L.str @@ -38,13 +38,13 @@ # CHECK-NEXT: Section { # CHECK-NEXT: Type: CUSTOM (0x0) # CHECK-NEXT: Size: 89 -# CHECK-NEXT: Offset: 206 +# CHECK-NEXT: Offset: 212 # CHECK-NEXT: Name: linking # CHECK-NEXT: } # CHECK-NEXT: Section { # CHECK-NEXT: Type: CUSTOM (0x0) # CHECK-NEXT: Size: 15 -# CHECK-NEXT: Offset: 309 +# CHECK-NEXT: Offset: 315 # CHECK-NEXT: Name: reloc.CODE # CHECK-NEXT: } # CHECK-NEXT: ] From 12250c4092b9f8fd043b37cbb73555706a4a412b Mon Sep 17 00:00:00 2001 From: Fabian Mora <fmora.dev@gmail.com> Date: Thu, 21 Dec 2023 17:18:36 -0500 Subject: [PATCH 119/342] Reland [OpenMP][Fix] libomptarget Fortran tests (#76189) This patch fixes the erroneous multiple-target requirement in Fortran offloading tests. Additionally, it adds two new variables (test_flags_clang, test_flags_flang) to lit.cfg so that compiler-specific flags for Clang and Flang can be specified. This patch re-lands: #74543. 
The error was caused by having: ``` config.substitutions.append(("%flags", config.test_flags)) config.substitutions.append(("%flags_clang", config.test_flags_clang)) config.substitutions.append(("%flags_flang", config.test_flags_flang)) ``` when instead it has to be: ``` config.substitutions.append(("%flags_clang", config.test_flags_clang)) config.substitutions.append(("%flags_flang", config.test_flags_flang)) config.substitutions.append(("%flags", config.test_flags)) ``` because LIT replaces with the first longest sub-string match. --- openmp/libomptarget/test/lit.cfg | 14 ++++++++++---- .../basic-target-region-1D-array-section.f90 | 2 +- .../basic-target-region-3D-array-section.f90 | 2 +- .../fortran/basic-target-region-3D-array.f90 | 2 +- .../offloading/fortran/basic_target_region.f90 | 2 +- .../test/offloading/fortran/constant-arr-index.f90 | 3 +-- .../declare-target-array-in-target-region.f90 | 2 +- .../double-target-call-with-declare-target.f90 | 3 ++- .../fortran/target-region-implicit-array.f90 | 2 +- .../offloading/fortran/target_map_common_block.f90 | 2 +- .../fortran/target_map_common_block2.f90 | 2 +- 11 files changed, 21 insertions(+), 15 deletions(-) diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index adbdd7cc35cc4..19c5e5c457222 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -78,6 +78,10 @@ config.test_flags = " -I " + config.test_source_root + \ " -L " + config.library_dir + \ " -L " + config.llvm_lib_directory +# compiler specific flags +config.test_flags_clang = "" +config.test_flags_flang = "" + if config.omp_host_rtl_directory: config.test_flags = config.test_flags + " -L " + \ config.omp_host_rtl_directory @@ -136,7 +140,7 @@ else: # Unices if config.cuda_libdir: config.test_flags += " -Wl,-rpath," + config.cuda_libdir if config.libomptarget_current_target.startswith('nvptx'): - config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir + '/DeviceRTL' + config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.library_dir + '/DeviceRTL' if config.libomptarget_current_target.endswith('-LTO'): config.test_flags += " -foffload-lto" if config.libomptarget_current_target.endswith('-JIT-LTO') and evaluate_bool_env( @@ -273,13 +277,13 @@ for libomptarget_target in config.libomptarget_all_targets: libomptarget_target, \ "%not --crash %t")) config.substitutions.append(("%clangxx-" + libomptarget_target, \ - "%clangxx %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\ + "%clangxx %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\ remove_suffix_if_present(libomptarget_target))) config.substitutions.append(("%clang-" + libomptarget_target, \ - "%clang %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\ + "%clang %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\ remove_suffix_if_present(libomptarget_target))) config.substitutions.append(("%flang-" + libomptarget_target, \ - "%flang %openmp_flags %flags -fopenmp-targets=" +\ + "%flang %openmp_flags %flags %flags_flang -fopenmp-targets=" +\ remove_suffix_if_present(libomptarget_target))) config.substitutions.append(("%fcheck-" + libomptarget_target, \ config.libomptarget_filecheck + " %s")) @@ -356,5 +360,7 @@ if config.libomptarget_current_target.startswith('nvptx') and config.cuda_path: config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path)) else: config.substitutions.append(("%cuda_flags", "")) +config.substitutions.append(("%flags_clang", config.test_flags_clang)) 
+config.substitutions.append(("%flags_flang", config.test_flags_flang)) config.substitutions.append(("%flags", config.test_flags)) config.substitutions.append(("%not", config.libomptarget_not)) diff --git a/openmp/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f90 b/openmp/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f90 index 993b91d4eb623..476b77e4a549b 100644 --- a/openmp/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f90 +++ b/openmp/libomptarget/test/offloading/fortran/basic-target-region-1D-array-section.f90 @@ -1,6 +1,6 @@ ! Basic offloading test of arrays with provided lower ! and upper bounds as specified by OpenMP's sectioning -! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda +! REQUIRES: flang ! UNSUPPORTED: nvptx64-nvidia-cuda-LTO ! UNSUPPORTED: aarch64-unknown-linux-gnu ! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO diff --git a/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f90 b/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f90 index 669d3674926f6..229798b57477d 100644 --- a/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f90 +++ b/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array-section.f90 @@ -1,6 +1,6 @@ ! Basic offloading test of a regular array explicitly ! passed within a target region -! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda +! REQUIRES: flang ! UNSUPPORTED: nvptx64-nvidia-cuda-LTO ! UNSUPPORTED: aarch64-unknown-linux-gnu ! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO diff --git a/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90 b/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90 index c87d6ee24aed3..ea3048185d52b 100644 --- a/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90 +++ b/openmp/libomptarget/test/offloading/fortran/basic-target-region-3D-array.f90 @@ -1,6 +1,6 @@ ! Basic offloading test of a regular array explicitly ! passed within a target region -! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda +! REQUIRES: flang ! UNSUPPORTED: nvptx64-nvidia-cuda-LTO ! UNSUPPORTED: aarch64-unknown-linux-gnu ! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO diff --git a/openmp/libomptarget/test/offloading/fortran/basic_target_region.f90 b/openmp/libomptarget/test/offloading/fortran/basic_target_region.f90 index 6423ac765670d..d856d42bb8cd6 100644 --- a/openmp/libomptarget/test/offloading/fortran/basic_target_region.f90 +++ b/openmp/libomptarget/test/offloading/fortran/basic_target_region.f90 @@ -1,5 +1,5 @@ ! Basic offloading test with a target region -! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda +! REQUIRES: flang ! UNSUPPORTED: nvptx64-nvidia-cuda-LTO ! UNSUPPORTED: aarch64-unknown-linux-gnu ! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO diff --git a/openmp/libomptarget/test/offloading/fortran/constant-arr-index.f90 b/openmp/libomptarget/test/offloading/fortran/constant-arr-index.f90 index 9064f60896f10..669630555c31e 100644 --- a/openmp/libomptarget/test/offloading/fortran/constant-arr-index.f90 +++ b/openmp/libomptarget/test/offloading/fortran/constant-arr-index.f90 @@ -2,8 +2,7 @@ ! that checks constant indexing on device ! correctly works (regression test for prior ! bug). -! REQUIRES: flang, amdgcn-amd-amdhsa -! UNSUPPORTED: nvptx64-nvidia-cuda +! REQUIRES: flang ! UNSUPPORTED: nvptx64-nvidia-cuda-LTO ! UNSUPPORTED: aarch64-unknown-linux-gnu ! 
UNSUPPORTED: aarch64-unknown-linux-gnu-LTO diff --git a/openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90 b/openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90 index d2e59d93a0209..c09146198768b 100644 --- a/openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90 +++ b/openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90 @@ -1,7 +1,7 @@ ! Offloading test with a target region mapping a declare target ! Fortran array writing some values to it and checking the host ! correctly receives the updates made on the device. -! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda +! REQUIRES: flang ! UNSUPPORTED: nvptx64-nvidia-cuda-LTO ! UNSUPPORTED: aarch64-unknown-linux-gnu ! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO diff --git a/openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90 b/openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90 index 884acb275a0eb..56c96727d4752 100644 --- a/openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90 +++ b/openmp/libomptarget/test/offloading/fortran/double-target-call-with-declare-target.f90 @@ -2,7 +2,8 @@ ! declare target Fortran array and writing some values to ! it before checking the host correctly receives the ! correct updates made on the device. -! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda +! REQUIRES: flang +! UNSUPPORTED: nvptx64-nvidia-cuda ! UNSUPPORTED: nvptx64-nvidia-cuda-LTO ! UNSUPPORTED: aarch64-unknown-linux-gnu ! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO diff --git a/openmp/libomptarget/test/offloading/fortran/target-region-implicit-array.f90 b/openmp/libomptarget/test/offloading/fortran/target-region-implicit-array.f90 index 5ef2547545e4b..ada6ef2ad7301 100644 --- a/openmp/libomptarget/test/offloading/fortran/target-region-implicit-array.f90 +++ b/openmp/libomptarget/test/offloading/fortran/target-region-implicit-array.f90 @@ -1,6 +1,6 @@ ! Basic offloading test of a regular array explicitly ! passed within a target region -! REQUIRES: flang, amdgcn-amd-amdhsa, nvptx64-nvidia-cuda +! REQUIRES: flang ! UNSUPPORTED: nvptx64-nvidia-cuda-LTO ! UNSUPPORTED: aarch64-unknown-linux-gnu ! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO diff --git a/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90 b/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90 index e782ef8a670a7..f20423edb9576 100644 --- a/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90 +++ b/openmp/libomptarget/test/offloading/fortran/target_map_common_block.f90 @@ -1,5 +1,5 @@ ! Basic offloading test with a target region -! REQUIRES: flang, amdgcn-amd-amdhsa +! REQUIRES: flang ! UNSUPPORTED: nvptx64-nvidia-cuda ! UNSUPPORTED: nvptx64-nvidia-cuda-LTO ! UNSUPPORTED: aarch64-unknown-linux-gnu diff --git a/openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f90 b/openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f90 index 8a9c47545fbd7..24e3e2b7ab117 100644 --- a/openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f90 +++ b/openmp/libomptarget/test/offloading/fortran/target_map_common_block2.f90 @@ -1,4 +1,4 @@ -! REQUIRES: flang, amdgcn-amd-amdhsa +! REQUIRES: flang ! UNSUPPORTED: nvptx64-nvidia-cuda ! UNSUPPORTED: nvptx64-nvidia-cuda-LTO ! 
UNSUPPORTED: aarch64-unknown-linux-gnu From 82b38e83cfbb3f996313b22f5daf0d104c0f27dc Mon Sep 17 00:00:00 2001 From: Tacet <4922191+AdvenamTacet@users.noreply.github.com> Date: Thu, 21 Dec 2023 23:26:10 +0100 Subject: [PATCH 120/342] [ASan][libc++] Optimization of container annotations (#76082) This commit implements conditional compilation for ASan helper code. As convey to me by @EricWF, string benchmarks with UBSan have been experiencing significant performance hit after the commit with ASan string annotations. This is likely due to the fact that no-op ASan code is not optimized out with UBSan. To address this issue, this commit conditionalizes the inclusion of ASan helper function bodies using `#ifdef` directives. This approach allows us to selectively include only the ASan code when it's actually required, thereby enhancing optimizations and improving performance. While issue was noticed in string benchmarks, I expect same overhead (just less noticeable) in other containers, therefore `std::vector` and `std::deque` have same changes. To see impact of that change run `string.libcxx.out` with UBSan and `--benchmark_filter=BM_StringAssign` or `--benchmark_filter=BM_StringConstruct`. --- libcxx/include/deque | 27 +++++++++++++++++++++++++++ libcxx/include/string | 11 +++++++++++ libcxx/include/vector | 11 +++++++++++ 3 files changed, 49 insertions(+) diff --git a/libcxx/include/deque b/libcxx/include/deque index d0520b635bcc8..fca8b3d6e2c73 100644 --- a/libcxx/include/deque +++ b/libcxx/include/deque @@ -998,15 +998,19 @@ private: } _LIBCPP_HIDE_FROM_ABI void __annotate_new(size_type __current_size) const _NOEXCEPT { + (void)__current_size; +#ifndef _LIBCPP_HAS_NO_ASAN if (__current_size == 0) __annotate_from_to(0, __map_.size() * __block_size, __asan_poison, __asan_back_moved); else { __annotate_from_to(0, __start_, __asan_poison, __asan_front_moved); __annotate_from_to(__start_ + __current_size, __map_.size() * __block_size, __asan_poison, __asan_back_moved); } +#endif } _LIBCPP_HIDE_FROM_ABI void __annotate_delete() const _NOEXCEPT { +#ifndef _LIBCPP_HAS_NO_ASAN if (empty()) { for (size_t __i = 0; __i < __map_.size(); ++__i) { __annotate_whole_block(__i, __asan_unposion); @@ -1015,30 +1019,52 @@ private: __annotate_from_to(0, __start_, __asan_unposion, __asan_front_moved); __annotate_from_to(__start_ + size(), __map_.size() * __block_size, __asan_unposion, __asan_back_moved); } +#endif } _LIBCPP_HIDE_FROM_ABI void __annotate_increase_front(size_type __n) const _NOEXCEPT { + (void)__n; +#ifndef _LIBCPP_HAS_NO_ASAN __annotate_from_to(__start_ - __n, __start_, __asan_unposion, __asan_front_moved); +#endif } _LIBCPP_HIDE_FROM_ABI void __annotate_increase_back(size_type __n) const _NOEXCEPT { + (void)__n; +#ifndef _LIBCPP_HAS_NO_ASAN __annotate_from_to(__start_ + size(), __start_ + size() + __n, __asan_unposion, __asan_back_moved); +#endif } _LIBCPP_HIDE_FROM_ABI void __annotate_shrink_front(size_type __old_size, size_type __old_start) const _NOEXCEPT { + (void)__old_size; + (void)__old_start; +#ifndef _LIBCPP_HAS_NO_ASAN __annotate_from_to(__old_start, __old_start + (__old_size - size()), __asan_poison, __asan_front_moved); +#endif } _LIBCPP_HIDE_FROM_ABI void __annotate_shrink_back(size_type __old_size, size_type __old_start) const _NOEXCEPT { + (void)__old_size; + (void)__old_start; +#ifndef _LIBCPP_HAS_NO_ASAN __annotate_from_to(__old_start + size(), __old_start + __old_size, __asan_poison, __asan_back_moved); +#endif } _LIBCPP_HIDE_FROM_ABI void __annotate_poison_block(const void* 
__beginning, const void* __end) const _NOEXCEPT { + (void)__beginning; + (void)__end; +#ifndef _LIBCPP_HAS_NO_ASAN __annotate_double_ended_contiguous_container(__beginning, __end, __beginning, __end, __end, __end); +#endif } _LIBCPP_HIDE_FROM_ABI void __annotate_whole_block(size_t __block_index, __asan_annotation_type __annotation_type) const _NOEXCEPT { + (void)__block_index; + (void)__annotation_type; +#ifndef _LIBCPP_HAS_NO_ASAN __map_const_iterator __block_it = __map_.begin() + __block_index; const void* __block_start = std::__to_address(*__block_it); const void* __block_end = std::__to_address(*__block_it + __block_size); @@ -1049,6 +1075,7 @@ private: __annotate_double_ended_contiguous_container( __block_start, __block_end, __block_start, __block_start, __block_start, __block_end); } +#endif } #if !defined(_LIBCPP_HAS_NO_ASAN) diff --git a/libcxx/include/string b/libcxx/include/string index fdffca5aed18b..c676182fba8ba 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -1903,23 +1903,34 @@ private: } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_new(size_type __current_size) const _NOEXCEPT { + (void) __current_size; +#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + capacity() + 1, data() + __current_size + 1); +#endif } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_delete() const _NOEXCEPT { +#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + size() + 1, data() + capacity() + 1); +#endif } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_increase(size_type __n) const _NOEXCEPT { + (void) __n; +#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + size() + 1, data() + size() + 1 + __n); +#endif } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_shrink(size_type __old_size) const _NOEXCEPT { + (void) __old_size; +#if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + __old_size + 1, data() + size() + 1); +#endif } template <size_type __a> diff --git a/libcxx/include/vector b/libcxx/include/vector index 3abc917f5c0e1..0098273a195ff 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -845,19 +845,30 @@ private: } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_new(size_type __current_size) const _NOEXCEPT { + (void)__current_size; +#ifndef _LIBCPP_HAS_NO_ASAN __annotate_contiguous_container(data(), data() + capacity(), data() + capacity(), data() + __current_size); +#endif } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_delete() const _NOEXCEPT { +#ifndef _LIBCPP_HAS_NO_ASAN __annotate_contiguous_container(data(), data() + capacity(), data() + size(), data() + capacity()); +#endif } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_increase(size_type __n) const _NOEXCEPT { + (void)__n; +#ifndef _LIBCPP_HAS_NO_ASAN __annotate_contiguous_container(data(), data() + capacity(), data() + size(), 
data() + size() + __n); +#endif } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __annotate_shrink(size_type __old_size) const _NOEXCEPT { + (void)__old_size; +#ifndef _LIBCPP_HAS_NO_ASAN __annotate_contiguous_container(data(), data() + capacity(), data() + __old_size, data() + size()); +#endif } struct _ConstructTransaction { From 3ca9bcc6ccd0de4e05c7b7749c24f94e5f184b45 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks <aeubanks@google.com> Date: Thu, 21 Dec 2023 14:30:43 -0800 Subject: [PATCH 121/342] [llvm][docs][X86] Mention code model improvements in ReleaseNotes (#76190) --- llvm/docs/ReleaseNotes.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 33afa09fcac3d..9d5124680141e 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -172,6 +172,11 @@ Changes to the X86 Backend * Support ISA of ``AVX10.1-256`` and ``AVX10.1-512``. * ``-mcpu=pantherlake`` and ``-mcpu=clearwaterforest`` are now supported. * ``-mapxf`` is supported. +* Marking global variables with ``code_model = "small"/"large"`` in the IR now + overrides the global code model to allow 32-bit relocations or require 64-bit + relocations to the global variable. +* The medium code model's code generation was audited to be more similar to the + small code model where possible. Changes to the OCaml bindings ----------------------------- From e64f5d6305c447b1ec3bc31128753b28f4e87f32 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Thu, 21 Dec 2023 14:34:49 -0800 Subject: [PATCH 122/342] [RISCV] Replace RISCVISD::VP_MERGE_VL with a new node that has a separate passthru operand. (#75682) ISD::VP_MERGE treats the false operand as the source for elements past VL. The vmerge instruction encodes 3 registers and treats the vd register as the source for the tail. This patch adds a new ISD opcode that models the tail source explicitly. During lowering we copy the false operand to this operand. I think we can merge RISCVISD::VSELECT_VL with this new opcode by using an UNDEF passthru, but I'll save that for another patch. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 27 +++- llvm/lib/Target/RISCV/RISCVISelLowering.h | 6 +- .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 125 ++++++++++-------- 3 files changed, 91 insertions(+), 67 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index d6dedd669ffd0..40518097fcce7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -5530,7 +5530,7 @@ static unsigned getRISCVVLOp(SDValue Op) { case ISD::VP_SELECT: return RISCVISD::VSELECT_VL; case ISD::VP_MERGE: - return RISCVISD::VP_MERGE_VL; + return RISCVISD::VMERGE_VL; case ISD::VP_ASHR: return RISCVISD::SRA_VL; case ISD::VP_LSHR: @@ -5578,6 +5578,8 @@ static bool hasMergeOp(unsigned Opcode) { return true; if (Opcode >= RISCVISD::STRICT_FADD_VL && Opcode <= RISCVISD::STRICT_FDIV_VL) return true; + if (Opcode == RISCVISD::VMERGE_VL) + return true; return false; } @@ -8242,8 +8244,8 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG, AVL); // TUMA or TUMU: Currently we always emit tumu policy regardless of tuma. // It's fine because vmerge does not care mask policy. 
- return DAG.getNode(RISCVISD::VP_MERGE_VL, DL, VT, Mask, Vec, MaskedOff, - AVL); + return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, Mask, Vec, MaskedOff, + MaskedOff, AVL); } } @@ -10316,9 +10318,20 @@ SDValue RISCVTargetLowering::lowerVPOp(SDValue Op, SelectionDAG &DAG) const { for (const auto &OpIdx : enumerate(Op->ops())) { SDValue V = OpIdx.value(); assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!"); - // Add dummy merge value before the mask. - if (HasMergeOp && *ISD::getVPMaskIdx(Op.getOpcode()) == OpIdx.index()) - Ops.push_back(DAG.getUNDEF(ContainerVT)); + // Add dummy merge value before the mask. Or if there isn't a mask, before + // EVL. + if (HasMergeOp) { + auto MaskIdx = ISD::getVPMaskIdx(Op.getOpcode()); + if (MaskIdx) { + if (*MaskIdx == OpIdx.index()) + Ops.push_back(DAG.getUNDEF(ContainerVT)); + } else if (ISD::getVPExplicitVectorLengthIdx(Op.getOpcode()) == + OpIdx.index()) { + // For VP_MERGE, copy the false operand instead of an undef value. + assert(Op.getOpcode() == ISD::VP_MERGE); + Ops.push_back(Ops.back()); + } + } // Pass through operands which aren't fixed-length vectors. if (!V.getValueType().isFixedLengthVector()) { Ops.push_back(V); @@ -18658,7 +18671,7 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VNSRL_VL) NODE_NAME_CASE(SETCC_VL) NODE_NAME_CASE(VSELECT_VL) - NODE_NAME_CASE(VP_MERGE_VL) + NODE_NAME_CASE(VMERGE_VL) NODE_NAME_CASE(VMAND_VL) NODE_NAME_CASE(VMOR_VL) NODE_NAME_CASE(VMXOR_VL) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 2d9f716cdf9a4..58ed611efc83d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -332,10 +332,8 @@ enum NodeType : unsigned { // Vector select with an additional VL operand. This operation is unmasked. VSELECT_VL, - // Vector select with operand #2 (the value when the condition is false) tied - // to the destination and an additional VL operand. This operation is - // unmasked. - VP_MERGE_VL, + // General vmerge node with mask, true, false, passthru, and vl operands. + VMERGE_VL, // Mask binary operators. 
VMAND_VL, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index dc6b57fad3210..33bdc3366aa3e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -344,7 +344,14 @@ def SDT_RISCVSelect_VL : SDTypeProfile<1, 4, [ ]>; def riscv_vselect_vl : SDNode<"RISCVISD::VSELECT_VL", SDT_RISCVSelect_VL>; -def riscv_vp_merge_vl : SDNode<"RISCVISD::VP_MERGE_VL", SDT_RISCVSelect_VL>; + +def SDT_RISCVVMERGE_VL : SDTypeProfile<1, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>, + SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisSameAs<0, 4>, + SDTCisVT<5, XLenVT> +]>; + +def riscv_vmerge_vl : SDNode<"RISCVISD::VMERGE_VL", SDT_RISCVVMERGE_VL>; def SDT_RISCVVMSETCLR_VL : SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i1>, SDTCisVT<1, XLenVT>]>; @@ -675,14 +682,14 @@ multiclass VPatTiedBinaryNoMaskVL_V<SDNode vop, op2_reg_class:$rs2, GPR:$vl, sew, TAIL_AGNOSTIC)>; // Tail undisturbed - def : Pat<(riscv_vp_merge_vl true_mask, + def : Pat<(riscv_vmerge_vl true_mask, (result_type (vop result_reg_class:$rs1, (op2_type op2_reg_class:$rs2), srcvalue, true_mask, VLOpFrag)), - result_reg_class:$rs1, VLOpFrag), + result_reg_class:$rs1, result_reg_class:$rs1, VLOpFrag), (!cast<Instruction>(instruction_name#"_"#suffix#"_"# vlmul.MX#"_TIED") result_reg_class:$rs1, op2_reg_class:$rs2, @@ -712,14 +719,14 @@ multiclass VPatTiedBinaryNoMaskVL_V_RM<SDNode vop, FRM_DYN, GPR:$vl, sew, TAIL_AGNOSTIC)>; // Tail undisturbed - def : Pat<(riscv_vp_merge_vl true_mask, + def : Pat<(riscv_vmerge_vl true_mask, (result_type (vop result_reg_class:$rs1, (op2_type op2_reg_class:$rs2), srcvalue, true_mask, VLOpFrag)), - result_reg_class:$rs1, VLOpFrag), + result_reg_class:$rs1, result_reg_class:$rs1, VLOpFrag), (!cast<Instruction>(instruction_name#"_"#suffix#"_"# vlmul.MX#"_TIED") result_reg_class:$rs1, op2_reg_class:$rs2, @@ -1697,21 +1704,21 @@ multiclass VPatMultiplyAccVL_VV_VX<PatFrag op, string instruction_name> { foreach vti = AllIntegerVectors in { defvar suffix = vti.LMul.MX; let Predicates = GetVTypePredicates<vti>.Predicates in { - def : Pat<(riscv_vp_merge_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (op vti.RegClass:$rd, (riscv_mul_vl_oneuse vti.RegClass:$rs1, vti.RegClass:$rs2, srcvalue, (vti.Mask true_mask), VLOpFrag), srcvalue, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vp_merge_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (op vti.RegClass:$rd, (riscv_mul_vl_oneuse (SplatPat XLenVT:$rs1), vti.RegClass:$rs2, srcvalue, (vti.Mask true_mask), VLOpFrag), srcvalue, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast<Instruction>(instruction_name#"_VX_"# suffix #"_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; @@ -1840,17 +1847,17 @@ multiclass VPatFPMulAccVL_VV_VF<PatFrag vop, string instruction_name> { foreach vti = AllFloatVectors in { defvar suffix = vti.LMul.MX; let Predicates = GetVTypePredicates<vti>.Predicates in { - def : Pat<(riscv_vp_merge_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop 
vti.RegClass:$rs1, vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vp_merge_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), GPR:$vl, vti.Log2SEW, TU_MU)>; @@ -1876,10 +1883,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> { foreach vti = AllFloatVectors in { defvar suffix = vti.LMul.MX; let Predicates = GetVTypePredicates<vti>.Predicates in { - def : Pat<(riscv_vp_merge_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop vti.RegClass:$rs1, vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast<Instruction>(instruction_name#"_VV_"# suffix #"_MASK") vti.RegClass:$rd, vti.RegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), @@ -1887,10 +1894,10 @@ multiclass VPatFPMulAccVL_VV_VF_RM<PatFrag vop, string instruction_name> { // RISCVInsertReadWriteCSR FRM_DYN, GPR:$vl, vti.Log2SEW, TU_MU)>; - def : Pat<(riscv_vp_merge_vl (vti.Mask V0), + def : Pat<(riscv_vmerge_vl (vti.Mask V0), (vti.Vector (vop (SplatFPOp vti.ScalarRegClass:$rs1), vti.RegClass:$rs2, vti.RegClass:$rd, (vti.Mask true_mask), VLOpFrag)), - vti.RegClass:$rd, VLOpFrag), + vti.RegClass:$rd, vti.RegClass:$rd, VLOpFrag), (!cast<Instruction>(instruction_name#"_V" # vti.ScalarSuffix # "_" # suffix # "_MASK") vti.RegClass:$rd, vti.ScalarRegClass:$rs1, vti.RegClass:$rs2, (vti.Mask V0), @@ -2273,29 +2280,32 @@ foreach vti = AllIntegerVectors in { (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0), - vti.RegClass:$rs1, - vti.RegClass:$rs2, - VLOpFrag)), + def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0), + vti.RegClass:$rs1, + vti.RegClass:$rs2, + vti.RegClass:$merge, + VLOpFrag)), (!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX) - vti.RegClass:$rs2, vti.RegClass:$rs2, vti.RegClass:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + vti.RegClass:$merge, vti.RegClass:$rs2, vti.RegClass:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; - def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0), - (SplatPat XLenVT:$rs1), - vti.RegClass:$rs2, - VLOpFrag)), + def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0), + (SplatPat XLenVT:$rs1), + vti.RegClass:$rs2, + vti.RegClass:$merge, + VLOpFrag)), (!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX) - vti.RegClass:$rs2, vti.RegClass:$rs2, GPR:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; - - def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0), - (SplatPat_simm5 simm5:$rs1), - vti.RegClass:$rs2, - VLOpFrag)), + vti.RegClass:$merge, vti.RegClass:$rs2, GPR:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + + def : Pat<(vti.Vector (riscv_vmerge_vl (vti.Mask V0), + (SplatPat_simm5 simm5:$rs1), + vti.RegClass:$rs2, + vti.RegClass:$merge, + VLOpFrag)), 
(!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX) - vti.RegClass:$rs2, vti.RegClass:$rs2, simm5:$rs1, - (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; + vti.RegClass:$merge, vti.RegClass:$rs2, simm5:$rs1, + (vti.Mask V0), GPR:$vl, vti.Log2SEW)>; } } @@ -2493,21 +2503,23 @@ foreach fvti = AllFloatVectors in { (fvti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0), - fvti.RegClass:$rs1, - fvti.RegClass:$rs2, - VLOpFrag)), - (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX) - fvti.RegClass:$rs2, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0), - GPR:$vl, fvti.Log2SEW)>; - - def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0), - (SplatFPOp (fvti.Scalar fpimm0)), - fvti.RegClass:$rs2, - VLOpFrag)), - (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) - fvti.RegClass:$rs2, fvti.RegClass:$rs2, 0, (fvti.Mask V0), - GPR:$vl, fvti.Log2SEW)>; + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), + fvti.RegClass:$rs1, + fvti.RegClass:$rs2, + fvti.RegClass:$merge, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX) + fvti.RegClass:$merge, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0), + GPR:$vl, fvti.Log2SEW)>; + + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), + (SplatFPOp (fvti.Scalar fpimm0)), + fvti.RegClass:$rs2, + fvti.RegClass:$merge, + VLOpFrag)), + (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX) + fvti.RegClass:$merge, fvti.RegClass:$rs2, 0, (fvti.Mask V0), + GPR:$vl, fvti.Log2SEW)>; } let Predicates = GetVTypePredicates<fvti>.Predicates in { @@ -2521,12 +2533,13 @@ foreach fvti = AllFloatVectors in { (fvti.Scalar fvti.ScalarRegClass:$rs1), (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; - def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0), - (SplatFPOp fvti.ScalarRegClass:$rs1), - fvti.RegClass:$rs2, - VLOpFrag)), + def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), + (SplatFPOp fvti.ScalarRegClass:$rs1), + fvti.RegClass:$rs2, + fvti.RegClass:$merge, + VLOpFrag)), (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX) - fvti.RegClass:$rs2, fvti.RegClass:$rs2, + fvti.RegClass:$merge, fvti.RegClass:$rs2, (fvti.Scalar fvti.ScalarRegClass:$rs1), (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>; From f70b229e9643ddb895d491b62a5ec0655917f6f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st> Date: Fri, 22 Dec 2023 00:40:12 +0200 Subject: [PATCH 123/342] [LLDB] Define _BSD_SOURCE globally, to get optreset available in mingw's getopt.h (#76137) We previously were defining _BSD_SOURCE right before including getopt.h. However, on mingw-w64, getopt.h is also transitively included by unistd.h, and unistd.h can be transitively included by many headers (recently, by some libc++ headers). Therefore, to be safe, we need to define _BSD_SOURCE before including any header. Thus do this in CMake. This fixes https://github.com/llvm/llvm-project/issues/76050. 
--- lldb/CMakeLists.txt | 4 ++++ lldb/include/lldb/Host/HostGetOpt.h | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index 4a53d7ef3d0da..7844d93d78d29 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -44,6 +44,10 @@ endif() if (WIN32) add_definitions(-D_ENABLE_EXTENDED_ALIGNED_STORAGE) + if (NOT MSVC) + # _BSD_SOURCE is required for MinGW's getopt.h to define optreset + add_definitions(-D_BSD_SOURCE) + endif() endif() if (LLDB_ENABLE_PYTHON) diff --git a/lldb/include/lldb/Host/HostGetOpt.h b/lldb/include/lldb/Host/HostGetOpt.h index 746e03e1bd1ee..52cfdf4dbb89c 100644 --- a/lldb/include/lldb/Host/HostGetOpt.h +++ b/lldb/include/lldb/Host/HostGetOpt.h @@ -11,10 +11,6 @@ #if !defined(_MSC_VER) && !defined(__NetBSD__) -#ifdef _WIN32 -#define _BSD_SOURCE // Required so that getopt.h defines optreset -#endif - #include <getopt.h> #include <unistd.h> From 38eea57e69a8a01e38e8dbc38614043a4553acb1 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida <cyndy_ishida@apple.com> Date: Thu, 21 Dec 2023 13:57:30 -0800 Subject: [PATCH 124/342] [ADT] fix grammatical typo in Twine.h docs, NFC --- llvm/include/llvm/ADT/Twine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/ADT/Twine.h b/llvm/include/llvm/ADT/Twine.h index 8dfbe4f72e07d..1f1fd1967efbc 100644 --- a/llvm/include/llvm/ADT/Twine.h +++ b/llvm/include/llvm/ADT/Twine.h @@ -37,7 +37,7 @@ namespace llvm { /// A Twine is not intended for use directly and should not be stored, its /// implementation relies on the ability to store pointers to temporary stack /// objects which may be deallocated at the end of a statement. Twines should - /// only be used accepted as const references in arguments, when an API wishes + /// only be used as const references in arguments, when an API wishes /// to accept possibly-concatenated strings. 
/// /// Twines support a special 'null' value, which always concatenates to form From 7c3b67d2038cfb48a80299089f6a1308eee1df7f Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Thu, 21 Dec 2023 16:03:06 -0800 Subject: [PATCH 125/342] [hwasan] Respect strip_path_prefix printing locals (#76132) --- compiler-rt/lib/hwasan/hwasan_report.cpp | 10 +++++-- .../test/hwasan/TestCases/strip_path_prefix.c | 27 +++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 compiler-rt/test/hwasan/TestCases/strip_path_prefix.c diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index bbe89112e4dbe..e9dd919d41497 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -205,6 +205,7 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa, tag_t addr_tag, uptr untagged_addr) { uptr frames = Min((uptr)flags()->stack_history_size, sa->size()); bool found_local = false; + InternalScopedString location; for (uptr i = 0; i < frames; i++) { const uptr *record_addr = &(*sa)[i]; uptr record = *record_addr; @@ -236,8 +237,13 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa, Printf("\nPotentially referenced stack objects:\n"); found_local = true; } - Printf(" %s in %s %s:%d\n", local.name, local.function_name, - local.decl_file, local.decl_line); + StackTracePrinter::GetOrInit()->RenderSourceLocation( + &location, local.decl_file, local.decl_line, /* column= */ 0, + common_flags()->symbolize_vs_style, + common_flags()->strip_path_prefix); + Printf(" %s in %s %s\n", local.name, local.function_name, + location.data()); + location.clear(); } frame.Clear(); } diff --git a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c new file mode 100644 index 0000000000000..5844749a6d977 --- /dev/null +++ b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c @@ -0,0 +1,27 @@ +// RUN: %clang_hwasan -O0 %s -o %t && %env_hwasan_opts=strip_path_prefix='"%S/"' not %run %t 2>&1 | FileCheck %s + +// Stack histories currently are not recorded on x86. +// XFAIL: target=x86_64{{.*}} + +#include <assert.h> +#include <sanitizer/hwasan_interface.h> +#include <stdio.h> + +int t; + +__attribute__((noinline)) char *buggy() { + char *volatile p; + char zzz = {}; + char yyy = {}; + p = t ? &yyy : &zzz; + return p; +} + +int main() { + char *p = buggy(); + return *p; + // CHECK: READ of size 1 at + // CHECK: #0 {{.*}} in main strip_path_prefix.c:[[@LINE-2]] + // CHECK: Potentially referenced stack objects: + // CHECK: zzz in buggy strip_path_prefix.c:[[@LINE-12]] +} From 033ec098be730bff04bfb929d254ce57e5ec8534 Mon Sep 17 00:00:00 2001 From: hstk30-hw <hanwei62@huawei.com> Date: Fri, 22 Dec 2023 09:00:41 +0800 Subject: [PATCH 126/342] [Clang][Sema] Fix Wswitch-default bad warning in template (#76007) https://github.com/llvm/llvm-project/pull/73077 added -Wswitch-default diagnostic but it produced false positives in templates. This PR will address that. 
https://github.com/llvm/llvm-project/issues/75943 --- clang/lib/Sema/SemaStmt.cpp | 7 ++-- clang/test/Sema/switch-default.c | 28 ---------------- clang/test/Sema/switch-default.cpp | 53 ++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 31 deletions(-) delete mode 100644 clang/test/Sema/switch-default.c create mode 100644 clang/test/Sema/switch-default.cpp diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 63348d27a8c94..f0b03db690843 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -1271,6 +1271,9 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch, bool CaseListIsErroneous = false; + // FIXME: We'd better diagnose missing or duplicate default labels even + // in the dependent case. Because default labels themselves are never + // dependent. for (SwitchCase *SC = SS->getSwitchCaseList(); SC && !HasDependentValue; SC = SC->getNextSwitchCase()) { @@ -1327,9 +1330,6 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch, } } - if (!TheDefaultStmt) - Diag(SwitchLoc, diag::warn_switch_default); - if (!HasDependentValue) { // If we don't have a default statement, check whether the // condition is constant. @@ -1344,6 +1344,7 @@ Sema::ActOnFinishSwitchStmt(SourceLocation SwitchLoc, Stmt *Switch, assert(!HasConstantCond || (ConstantCondValue.getBitWidth() == CondWidth && ConstantCondValue.isSigned() == CondIsSigned)); + Diag(SwitchLoc, diag::warn_switch_default); } bool ShouldCheckConstantCond = HasConstantCond; diff --git a/clang/test/Sema/switch-default.c b/clang/test/Sema/switch-default.c deleted file mode 100644 index 342a97ee68b1e..0000000000000 --- a/clang/test/Sema/switch-default.c +++ /dev/null @@ -1,28 +0,0 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -Wswitch-default %s - -int f1(int a) { - switch (a) { // expected-warning {{'switch' missing 'default' label}} - case 1: a++; break; - case 2: a += 2; break; - } - return a; -} - -int f2(int a) { - switch (a) { // no-warning - default: - ; - } - return a; -} - -// Warn even completely covered Enum cases(GCC compatibility). -enum E { A, B }; -enum E check_enum(enum E e) { - switch (e) { // expected-warning {{'switch' missing 'default' label}} - case A: break; - case B: break; - } - return e; -} - diff --git a/clang/test/Sema/switch-default.cpp b/clang/test/Sema/switch-default.cpp new file mode 100644 index 0000000000000..32d03dae88273 --- /dev/null +++ b/clang/test/Sema/switch-default.cpp @@ -0,0 +1,53 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wswitch-default %s + +int f1(int a) { + switch (a) { // expected-warning {{'switch' missing 'default' label}} + case 1: a++; break; + case 2: a += 2; break; + } + return a; +} + +int f2(int a) { + switch (a) { // no-warning + default: + ; + } + return a; +} + +// Warn even completely covered Enum cases(GCC compatibility). 
+enum E { A, B }; +enum E check_enum(enum E e) { + switch (e) { // expected-warning {{'switch' missing 'default' label}} + case A: break; + case B: break; + } + return e; +} + +template<typename Index> +int t1(Index i) +{ + switch (i) { // expected-warning {{'switch' missing 'default' label}} + case 0: return 0; + case 1: return 1; + } + return 0; +} + +template<typename Index> +int t2(Index i) +{ + switch (i) { // no-warning + case 0: return 0; + case 1: return 1; + default: return 2; + } + return 0; +} + +int main() { + return t1(1); // expected-note {{in instantiation of function template specialization 't1<int>' requested here}} +} + From 7db28dd3f897a8a851bb52741e2de6959a46ddf0 Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Thu, 21 Dec 2023 17:51:14 -0800 Subject: [PATCH 127/342] [hwasan] Classify stack overflow, and use after scope (#76133) We can't distinguish UAR and UAS, but by definition UAR is already UAS. --- compiler-rt/lib/hwasan/hwasan_report.cpp | 48 ++++++++++++++----- .../test/hwasan/TestCases/stack-overflow.c | 25 ++++++++++ .../test/hwasan/TestCases/stack-uar-dynamic.c | 2 + compiler-rt/test/hwasan/TestCases/stack-uar.c | 2 + .../test/hwasan/TestCases/stack-underflow.c | 25 ++++++++++ 5 files changed, 91 insertions(+), 11 deletions(-) create mode 100644 compiler-rt/test/hwasan/TestCases/stack-overflow.c create mode 100644 compiler-rt/test/hwasan/TestCases/stack-underflow.c diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index e9dd919d41497..5b3a99adfea7c 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -221,22 +221,49 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa, for (LocalInfo &local : frame.locals) { if (!local.has_frame_offset || !local.has_size || !local.has_tag_offset) continue; + if (!(local.name && internal_strlen(local.name)) && + !(local.function_name && internal_strlen(local.name)) && + !(local.decl_file && internal_strlen(local.decl_file))) + continue; tag_t obj_tag = base_tag ^ local.tag_offset; if (obj_tag != addr_tag) continue; - // Calculate the offset from the object address to the faulting - // address. Because we only store bits 4-19 of FP (bits 0-3 are - // guaranteed to be zero), the calculation is performed mod 2^20 and may - // harmlessly underflow if the address mod 2^20 is below the object - // address. - uptr obj_offset = - (untagged_addr - fp - local.frame_offset) & (kRecordFPModulus - 1); - if (obj_offset >= local.size) - continue; + // Guess top bits of local variable from the faulting address, because + // we only store bits 4-19 of FP (bits 0-3 are guaranteed to be zero). 
+ uptr local_beg = (fp + local.frame_offset) | + (untagged_addr & ~(uptr(kRecordFPModulus) - 1)); + uptr local_end = local_beg + local.size; + if (!found_local) { Printf("\nPotentially referenced stack objects:\n"); found_local = true; } + + uptr offset; + const char *whence; + const char *cause; + if (local_beg <= untagged_addr && untagged_addr < local_end) { + offset = untagged_addr - local_beg; + whence = "inside"; + cause = "use-after-scope"; + } else if (untagged_addr >= local_end) { + offset = untagged_addr - local_end; + whence = "after"; + cause = "stack-buffer-overflow"; + } else { + offset = local_beg - untagged_addr; + whence = "before"; + cause = "stack-buffer-overflow"; + } + Decorator d; + Printf("%s", d.Error()); + Printf("Cause: %s\n", cause); + Printf("%s", d.Default()); + Printf("%s", d.Location()); + Printf("%p is located %zd bytes %s a %zd-byte region [%p,%p)\n", + untagged_addr, offset, whence, local_end - local_beg, local_beg, + local_end); + Printf("%s", d.Allocation()); StackTracePrinter::GetOrInit()->RenderSourceLocation( &location, local.decl_file, local.decl_line, /* column= */ 0, common_flags()->symbolize_vs_style, @@ -244,6 +271,7 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa, Printf(" %s in %s %s\n", local.name, local.function_name, location.data()); location.clear(); + Printf("%s\n", d.Default()); } frame.Clear(); } @@ -751,8 +779,6 @@ void BaseReport::PrintAddressDescription() const { // Check stack first. If the address is on the stack of a live thread, we // know it cannot be a heap / global overflow. for (const auto &sa : allocations.stack) { - // TODO(fmayer): figure out how to distinguish use-after-return and - // stack-buffer-overflow. Printf("%s", d.Error()); Printf("\nCause: stack tag-mismatch\n"); Printf("%s", d.Location()); diff --git a/compiler-rt/test/hwasan/TestCases/stack-overflow.c b/compiler-rt/test/hwasan/TestCases/stack-overflow.c new file mode 100644 index 0000000000000..10e8d9c59e4bb --- /dev/null +++ b/compiler-rt/test/hwasan/TestCases/stack-overflow.c @@ -0,0 +1,25 @@ +// RUN: %clang_hwasan -g %s -o %t && not %run %t 2>&1 | FileCheck %s + +// Stack histories currently are not recorded on x86. 
+// XFAIL: target=x86_64{{.*}} + +__attribute((noinline)) void buggy() { + char c[64]; + char *volatile p = c; + p[65] = 0; +} + +int main() { + buggy(); + // CHECK: WRITE of size 1 at + // CHECK: #0 {{.*}} in buggy{{.*}}stack-overflow.c:[[@LINE-6]] + // CHECK: Cause: stack tag-mismatch + // CHECK: is located in stack of thread + // CHECK: Potentially referenced stack objects: + // CHECK: Cause: stack-buffer-overflow + // CHECK-NEXT: 0x{{.*}} is located 1 bytes after a 64-byte region + // CHECK-NEXT: c in buggy {{.*}}stack-overflow.c: + // CHECK: Memory tags around the buggy address + + // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in buggy +} diff --git a/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c b/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c index b06568e12eba7..7a2a11593e7af 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c +++ b/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c @@ -21,6 +21,8 @@ char *buggy(int b) { int main() { char *p = buggy(1); // CHECK: Potentially referenced stack objects: + // CHECK-NEXT: use-after-scope + // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 64-byte region // CHECK-NEXT: c in buggy p[0] = 0; } diff --git a/compiler-rt/test/hwasan/TestCases/stack-uar.c b/compiler-rt/test/hwasan/TestCases/stack-uar.c index 29941e617ad67..8810701f0c9ca 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-uar.c +++ b/compiler-rt/test/hwasan/TestCases/stack-uar.c @@ -50,6 +50,8 @@ int main() { // CHECK: Cause: stack tag-mismatch // CHECK: is located in stack of thread // CHECK: Potentially referenced stack objects: + // CHECK: Cause: use-after-scope + // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 2048-byte region // CHECK-NEXT: {{zzz|yyy}} in buggy {{.*}}stack-uar.c: // CHECK: Memory tags around the buggy address diff --git a/compiler-rt/test/hwasan/TestCases/stack-underflow.c b/compiler-rt/test/hwasan/TestCases/stack-underflow.c new file mode 100644 index 0000000000000..8e5174519272f --- /dev/null +++ b/compiler-rt/test/hwasan/TestCases/stack-underflow.c @@ -0,0 +1,25 @@ +// RUN: %clang_hwasan -g %s -o %t && not %run %t 2>&1 | FileCheck %s + +// Stack histories currently are not recorded on x86. +// XFAIL: target=x86_64{{.*}} + +__attribute((noinline)) void buggy() { + char c[64]; + char *volatile p = c; + p[-2] = 0; +} + +int main() { + buggy(); + // CHECK: WRITE of size 1 at + // CHECK: #0 {{.*}} in buggy{{.*}}stack-underflow.c:[[@LINE-6]] + // CHECK: Cause: stack tag-mismatch + // CHECK: is located in stack of thread + // CHECK: Potentially referenced stack objects: + // CHECK: Cause: stack-buffer-overflow + // CHECK-NEXT: 0x{{.*}} is located 2 bytes before a 64-byte region + // CHECK-NEXT: c in buggy {{.*}}stack-underflow.c: + // CHECK: Memory tags around the buggy address + + // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in buggy +} From c99670ba513529b3ab6a649be7377b863dc110be Mon Sep 17 00:00:00 2001 From: Matthias Springer <me@m-sp.org> Date: Fri, 22 Dec 2023 11:12:58 +0900 Subject: [PATCH 128/342] [mlir][vector] `LoadOp`/`StoreOp`: Allow 0-D vectors (#76134) Similar to `vector.transfer_read`/`vector.transfer_write`, allow 0-D vectors. This commit fixes `mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir` when verifying the IR after each pattern (#74270). That test produces a temporary 0-D load/store op. 
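As a usage sketch (hypothetical function name; operand shapes mirror the new
ops.mlir test below), this is the kind of rank-0 round-trip that becomes legal
with this change:

```mlir
// 0-D vector load/store on a scalar memref; previously rejected by the op
// verifiers, accepted after this change.
func.func @zero_d_roundtrip(%m : memref<200x100xf32>, %i : index, %j : index) {
  %v = vector.load %m[%i, %j] : memref<200x100xf32>, vector<f32>
  vector.store %v, %m[%i, %j] : memref<200x100xf32>, vector<f32>
  return
}
```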
--- .../mlir/Dialect/Vector/IR/VectorOps.td | 42 ++++++++++++------- .../VectorToLLVM/vector-to-llvm.mlir | 30 +++++++++++++ mlir/test/Dialect/Vector/ops.mlir | 10 +++++ 3 files changed, 67 insertions(+), 15 deletions(-) diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 423118f79e733..40d874dc99dd9 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -1582,22 +1582,27 @@ def Vector_LoadOp : Vector_Op<"load"> { vector. If the memref element type is vector, it should match the result vector type. - Example 1: 1-D vector load on a scalar memref. + Example: 0-D vector load on a scalar memref. + ```mlir + %result = vector.load %base[%i, %j] : memref<100x100xf32>, vector<f32> + ``` + + Example: 1-D vector load on a scalar memref. ```mlir %result = vector.load %base[%i, %j] : memref<100x100xf32>, vector<8xf32> ``` - Example 2: 1-D vector load on a vector memref. + Example: 1-D vector load on a vector memref. ```mlir %result = vector.load %memref[%i, %j] : memref<200x100xvector<8xf32>>, vector<8xf32> ``` - Example 3: 2-D vector load on a scalar memref. + Example: 2-D vector load on a scalar memref. ```mlir %result = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<4x8xf32> ``` - Example 4: 2-D vector load on a vector memref. + Example: 2-D vector load on a vector memref. ```mlir %result = vector.load %memref[%i, %j] : memref<200x100xvector<4x8xf32>>, vector<4x8xf32> ``` @@ -1608,12 +1613,12 @@ def Vector_LoadOp : Vector_Op<"load"> { loaded out of bounds. Not all targets may support out-of-bounds vector loads. - Example 5: Potential out-of-bound vector load. + Example: Potential out-of-bound vector load. ```mlir %result = vector.load %memref[%index] : memref<?xf32>, vector<8xf32> ``` - Example 6: Explicit out-of-bound vector load. + Example: Explicit out-of-bound vector load. ```mlir %result = vector.load %memref[%c0] : memref<7xf32>, vector<8xf32> ``` @@ -1622,7 +1627,7 @@ def Vector_LoadOp : Vector_Op<"load"> { let arguments = (ins Arg<AnyMemRef, "the reference to load from", [MemRead]>:$base, Variadic<Index>:$indices); - let results = (outs AnyVector:$result); + let results = (outs AnyVectorOfAnyRank:$result); let extraClassDeclaration = [{ MemRefType getMemRefType() { @@ -1660,22 +1665,27 @@ def Vector_StoreOp : Vector_Op<"store"> { to store. If the memref element type is vector, it should match the type of the value to store. - Example 1: 1-D vector store on a scalar memref. + Example: 0-D vector store on a scalar memref. + ```mlir + vector.store %valueToStore, %memref[%i, %j] : memref<200x100xf32>, vector<f32> + ``` + + Example: 1-D vector store on a scalar memref. ```mlir vector.store %valueToStore, %memref[%i, %j] : memref<200x100xf32>, vector<8xf32> ``` - Example 2: 1-D vector store on a vector memref. + Example: 1-D vector store on a vector memref. ```mlir vector.store %valueToStore, %memref[%i, %j] : memref<200x100xvector<8xf32>>, vector<8xf32> ``` - Example 3: 2-D vector store on a scalar memref. + Example: 2-D vector store on a scalar memref. ```mlir vector.store %valueToStore, %memref[%i, %j] : memref<200x100xf32>, vector<4x8xf32> ``` - Example 4: 2-D vector store on a vector memref. + Example: 2-D vector store on a vector memref. ```mlir vector.store %valueToStore, %memref[%i, %j] : memref<200x100xvector<4x8xf32>>, vector<4x8xf32> ``` @@ -1685,21 +1695,23 @@ def Vector_StoreOp : Vector_Op<"store"> { target-specific. 
No assumptions should be made on the memory written out of bounds. Not all targets may support out-of-bounds vector stores. - Example 5: Potential out-of-bounds vector store. + Example: Potential out-of-bounds vector store. ```mlir vector.store %valueToStore, %memref[%index] : memref<?xf32>, vector<8xf32> ``` - Example 6: Explicit out-of-bounds vector store. + Example: Explicit out-of-bounds vector store. ```mlir vector.store %valueToStore, %memref[%c0] : memref<7xf32>, vector<8xf32> ``` }]; - let arguments = (ins AnyVector:$valueToStore, + let arguments = (ins + AnyVectorOfAnyRank:$valueToStore, Arg<AnyMemRef, "the reference to store to", [MemWrite]>:$base, - Variadic<Index>:$indices); + Variadic<Index>:$indices + ); let extraClassDeclaration = [{ MemRefType getMemRefType() { diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index d80392ebd87b0..7ea0197bdecb3 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -2059,6 +2059,36 @@ func.func @vector_store_op_index(%memref : memref<200x100xindex>, %i : index, %j // ----- +func.func @vector_load_op_0d(%memref : memref<200x100xf32>, %i : index, %j : index) -> vector<f32> { + %0 = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<f32> + return %0 : vector<f32> +} + +// CHECK-LABEL: func @vector_load_op_0d +// CHECK: %[[load:.*]] = memref.load %{{.*}}[%{{.*}}, %{{.*}}] +// CHECK: %[[vec:.*]] = llvm.mlir.undef : vector<1xf32> +// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : i32 +// CHECK: %[[inserted:.*]] = llvm.insertelement %[[load]], %[[vec]][%[[c0]] : i32] : vector<1xf32> +// CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[inserted]] : vector<1xf32> to vector<f32> +// CHECK: return %[[cast]] : vector<f32> + +// ----- + +func.func @vector_store_op_0d(%memref : memref<200x100xf32>, %i : index, %j : index) { + %val = arith.constant dense<11.0> : vector<f32> + vector.store %val, %memref[%i, %j] : memref<200x100xf32>, vector<f32> + return +} + +// CHECK-LABEL: func @vector_store_op_0d +// CHECK: %[[val:.*]] = arith.constant dense<1.100000e+01> : vector<f32> +// CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[val]] : vector<f32> to vector<1xf32> +// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64 +// CHECK: %[[extracted:.*]] = llvm.extractelement %[[cast]][%[[c0]] : i64] : vector<1xf32> +// CHECK: memref.store %[[extracted]], %{{.*}}[%{{.*}}, %{{.*}}] + +// ----- + func.func @masked_load_op(%arg0: memref<?xf32>, %arg1: vector<16xi1>, %arg2: vector<16xf32>) -> vector<16xf32> { %c0 = arith.constant 0: index %0 = vector.maskedload %arg0[%c0], %arg1, %arg2 : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32> diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir index 9f1ec21cdabf6..03532c5c1ceb1 100644 --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -725,6 +725,16 @@ func.func @flat_transpose_int(%arg0: vector<16xi32>) -> vector<16xi32> { return %0 : vector<16xi32> } +// CHECK-LABEL: @vector_load_and_store_0d_scalar_memref +func.func @vector_load_and_store_0d_scalar_memref(%memref : memref<200x100xf32>, + %i : index, %j : index) { + // CHECK: %[[ld:.*]] = vector.load %{{.*}}[%{{.*}}] : memref<200x100xf32>, vector<f32> + %0 = vector.load %memref[%i, %j] : memref<200x100xf32>, vector<f32> + // CHECK: vector.store %[[ld]], %{{.*}}[%{{.*}}] : memref<200x100xf32>, vector<f32> + 
vector.store %0, %memref[%i, %j] : memref<200x100xf32>, vector<f32> + return +} + // CHECK-LABEL: @vector_load_and_store_1d_scalar_memref func.func @vector_load_and_store_1d_scalar_memref(%memref : memref<200x100xf32>, %i : index, %j : index) { From c03745d23a22fea669ccc3e481f52ddf3d3f0406 Mon Sep 17 00:00:00 2001 From: eric <eric@efcs.ca> Date: Thu, 21 Dec 2023 21:32:21 -0500 Subject: [PATCH 129/342] libc++-infa: Hotfix runner group The runners-32 group is broken, for reasons... The easiest fix is to move the jobs to runners-8. (which needs to be renamed, because they're all actually 30 core machines) --- .github/workflows/libcxx-build-and-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 370cf830a60cf..25e8c8c1ef21a 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -185,7 +185,7 @@ jobs: std_modules: 'OFF' # Use a larger machine for MSAN to avoid timeout and memory allocation issues. - config: 'generic-msan' - machine: libcxx-runners-32-set + machine: libcxx-runners-8-set std_modules: 'OFF' runs-on: ${{ matrix.machine }} steps: From 62d8ae0a1e7b80a91dc579dc22b335bb22ed07f8 Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Fri, 22 Dec 2023 10:14:59 +0800 Subject: [PATCH 130/342] [X86][NFC] Remove class (VEX/EVEX/XOP)_4V and add class VVVV `VEX_4V` does not look simpler than `VEX, VVVV`. It's kind of confusing b/c classes like `VEX_L`, `VEX_LIG` do not imply `VEX` but it does. For APX, we have promote EVEX, NDD, NF and NDD_NF instructions. All of the 4 variants are in EVEX space and NDD/NDD_NF set the VVVV fields. To extract the common fields (e.g EVEX) into a class and set VVVV conditionally, we need VVVV to not imply other prefixes. --- llvm/lib/Target/X86/X86InstrAMX.td | 16 +- llvm/lib/Target/X86/X86InstrAVX512.td | 446 ++++++++++----------- llvm/lib/Target/X86/X86InstrArithmetic.td | 16 +- llvm/lib/Target/X86/X86InstrMisc.td | 28 +- llvm/lib/Target/X86/X86InstrSSE.td | 462 +++++++++++----------- llvm/lib/Target/X86/X86InstrTBM.td | 4 +- llvm/lib/Target/X86/X86InstrUtils.td | 20 +- llvm/lib/Target/X86/X86InstrXOP.td | 28 +- 8 files changed, 509 insertions(+), 511 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 2dbb3e5ee3169..71e6a44c9d8e7 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -91,19 +91,19 @@ let Predicates = [HasAMXINT8, In64BitMode] in { def TDPBSSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbssd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, - VEX_4V, T8XD; + VEX, VVVV, T8XD; def TDPBSUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbsud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, - VEX_4V, T8XS; + VEX, VVVV, T8XS; def TDPBUSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbusd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, - VEX_4V, T8PD; + VEX, VVVV, T8PD; def TDPBUUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, - VEX_4V, T8PS; + VEX, VVVV, T8PS; } // Pseduo instruction for RA. 
@@ -163,7 +163,7 @@ let Predicates = [HasAMXBF16, In64BitMode] in { def TDPBF16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX_4V, T8XS; + []>, VEX, VVVV, T8XS; // Pseduo instruction for RA. let isPseudo = true, Constraints = "$src4 = $dst" in @@ -193,7 +193,7 @@ let Predicates = [HasAMXFP16, In64BitMode] in { def TDPFP16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX_4V, T8XD; + []>, VEX, VVVV, T8XD; } // Pseduo instruction for RA. @@ -222,11 +222,11 @@ let Predicates = [HasAMXCOMPLEX, In64BitMode] in { def TCMMIMFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, T8PD, VEX_4V; + []>, T8PD, VEX, VVVV; def TCMMRLFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX_4V, WIG, T8PS; + []>, VEX, VVVV, WIG, T8PS; } // Constraints = "$src1 = $dst" diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index e1fe2b680b96a..86619dfd07bca 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -378,7 +378,7 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From, (vinsert_for_mask:$src3 (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm))>, - AVX512AIi8Base, EVEX_4V, Sched<[sched]>; + AVX512AIi8Base, EVEX, VVVV, Sched<[sched]>; let mayLoad = 1 in defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst), (ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3), @@ -389,7 +389,7 @@ multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From, (iPTR imm)), (vinsert_for_mask:$src3 (To.VT To.RC:$src1), (From.VT (From.LdFrag addr:$src2)), - (iPTR imm))>, AVX512AIi8Base, EVEX_4V, + (iPTR imm))>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<From.EltSize, From.CD8TupleForm>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -647,14 +647,14 @@ def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, timm:$src3))]>, - EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>; + EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>; def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), timm:$src3))]>, - EVEX_4V, EVEX_CD8<32, CD8VT1>, + EVEX, VVVV, EVEX_CD8<32, CD8VT1>, Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; } @@ -1593,7 +1593,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1, _.RC:$src3)), 1>, - EVEX_4V, AVX5128IBase, Sched<[sched]>; + EVEX, VVVV, AVX5128IBase, Sched<[sched]>; let mayLoad = 1 in defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), @@ -1601,7 +1601,7 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1, (_.VT 
(_.LdFrag addr:$src3)))), 1>, - EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -1616,7 +1616,7 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src2, IdxVT.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, - AVX5128IBase, EVEX_4V, EVEX_B, + AVX5128IBase, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1715,14 +1715,14 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { (ins IdxVT.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>, - EVEX_4V, AVX5128IBase, Sched<[sched]>; + EVEX, VVVV, AVX5128IBase, Sched<[sched]>; defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins IdxVT.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, (_.LdFrag addr:$src3))), 1>, - EVEX_4V, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, AVX5128IBase, Sched<[sched.Folded, sched.ReadAfterFold]>; } } multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, @@ -1735,7 +1735,7 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,(_.VT (_.BroadcastLdFrag addr:$src3)))), 1>, - AVX5128IBase, EVEX_4V, EVEX_B, + AVX5128IBase, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -1800,35 +1800,35 @@ multiclass WriteFVarBlendask<bits<8> opc, string OpcodeStr, (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), []>, - EVEX_4V, Sched<[sched]>; + EVEX, VVVV, Sched<[sched]>; def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - []>, EVEX_4V, EVEX_K, Sched<[sched]>; + []>, EVEX, VVVV, EVEX_K, Sched<[sched]>; def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), - []>, EVEX_4V, EVEX_KZ, Sched<[sched]>; + []>, EVEX, VVVV, EVEX_KZ, Sched<[sched]>; let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), - []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + []>, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>, + []>, EVEX, VVVV, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), - []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>, + []>, EVEX, VVVV, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -1841,7 +1841,7 @@ multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst 
{${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), []>, - EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + EVEX, VVVV, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), @@ -1849,7 +1849,7 @@ multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}} {z}|", "$dst {${mask}} {z}, $src1, ${src2}", _.BroadcastStr, "}"), []>, - EVEX_4V, EVEX_KZ, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + EVEX, VVVV, EVEX_KZ, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), @@ -1857,7 +1857,7 @@ multiclass WriteFVarBlendask_rmb<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", "$dst, $src1, ${src2}", _.BroadcastStr, "}"), []>, - EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + EVEX, VVVV, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -1921,7 +1921,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, "$cc, $src2, $src1", "$src1, $src2, $cc", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc), (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), - timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC; + timm:$cc)>, EVEX, VVVV, VEX_LIG, Sched<[sched]>, SIMD_EXC; let mayLoad = 1 in defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), @@ -1931,7 +1931,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2), timm:$cc), (OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2), - timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, + timm:$cc)>, EVEX, VVVV, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; let Uses = [MXCSR] in @@ -1944,7 +1944,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, timm:$cc), (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc)>, - EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>; + EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>; let isCodeGenOnly = 1 in { let isCommutable = 1 in @@ -1955,7 +1955,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, [(set _.KRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2, timm:$cc))]>, - EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC; + EVEX, VVVV, VEX_LIG, Sched<[sched]>, SIMD_EXC; def rm : AVX512Ii8<0xC2, MRMSrcMem, (outs _.KRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), @@ -1964,7 +1964,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, [(set _.KRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2), timm:$cc))]>, - EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, + EVEX, VVVV, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } @@ -1991,24 +1991,24 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, def rr : AVX512BI<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX_4V, Sched<[sched]>; + []>, EVEX, VVVV, Sched<[sched]>; let mayLoad = 1, hasSideEffects = 0 in def rm : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX_4V, Sched<[sched.Folded, 
sched.ReadAfterFold]>; + []>, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCommutable = IsCommutable, hasSideEffects = 0 in def rrk : AVX512BI<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), - []>, EVEX_4V, EVEX_K, Sched<[sched]>; + []>, EVEX, VVVV, EVEX_K, Sched<[sched]>; let mayLoad = 1, hasSideEffects = 0 in def rmk : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), - []>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX, VVVV, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, @@ -2020,14 +2020,14 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", "|$dst, $src1, ${src2}", _.BroadcastStr, "}"), - []>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), - []>, EVEX_4V, EVEX_K, EVEX_B, + []>, EVEX, VVVV, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -2113,7 +2113,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag, [(set _.KRC:$dst, (_.KVT (Frag:$cc (_.VT _.RC:$src1), (_.VT _.RC:$src2), cond)))]>, - EVEX_4V, Sched<[sched]>; + EVEX, VVVV, Sched<[sched]>; def rmi : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, @@ -2123,7 +2123,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag, (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), cond)))]>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCommutable = 1 in def rrik : AVX512AIi8<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, @@ -2135,7 +2135,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag, (_.KVT (Frag_su:$cc (_.VT _.RC:$src1), (_.VT _.RC:$src2), cond))))]>, - EVEX_4V, EVEX_K, Sched<[sched]>; + EVEX, VVVV, EVEX_K, Sched<[sched]>; def rmik : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, u8imm:$cc), @@ -2148,7 +2148,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag, (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)), cond))))]>, - EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; def : Pat<(_.KVT (Frag:$cc (_.LdFrag addr:$src2), (_.VT _.RC:$src1), cond)), @@ -2177,7 +2177,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, PatFrag Frag, (_.VT _.RC:$src1), (_.BroadcastLdFrag addr:$src2), cond)))]>, - EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmibk : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), @@ -2189,7 +2189,7 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string 
Suffix, PatFrag Frag, (_.VT _.RC:$src1), (_.BroadcastLdFrag addr:$src2), cond))))]>, - EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def : Pat<(_.KVT (Frag:$cc (_.BroadcastLdFrag addr:$src2), (_.VT _.RC:$src1), cond)), @@ -2405,11 +2405,11 @@ multiclass avx512_vcmp<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _, } defm VCMPPD : avx512_vcmp<SchedWriteFCmp, avx512vl_f64_info>, - AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W; + AVX512PDIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; defm VCMPPS : avx512_vcmp<SchedWriteFCmp, avx512vl_f32_info>, - AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + AVX512PSIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; defm VCMPPH : avx512_vcmp<SchedWriteFCmp, avx512vl_f16_info, HasFP16>, - AVX512PSIi8Base, EVEX_4V, EVEX_CD8<16, CD8VF>, TA; + AVX512PSIi8Base, EVEX, VVVV, EVEX_CD8<16, CD8VF>, TA; // Patterns to select fp compares with load as first operand. let Predicates = [HasAVX512] in { @@ -2812,13 +2812,13 @@ multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, bit IsCommutable, Predicate prdW = HasAVX512> { defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, - sched, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; + sched, HasDQI, IsCommutable>, VEX, VVVV, VEX_L, PD; defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - sched, prdW, IsCommutable>, VEX_4V, VEX_L, PS; + sched, prdW, IsCommutable>, VEX, VVVV, VEX_L, PS; defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, - sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, REX_W, PD; + sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, PD; defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, - sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, REX_W, PS; + sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, PS; } // TODO - do we need a X86SchedWriteWidths::KMASK type? 
@@ -2869,7 +2869,7 @@ multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst, def rr : I<0x4b, MRMSrcReg, (outs Dst.KRC:$dst), (ins Src.KRC:$src1, Src.KRC:$src2), "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - VEX_4V, VEX_L, Sched<[sched]>; + VEX, VVVV, VEX_L, Sched<[sched]>; def : Pat<(Dst.KVT (concat_vectors Src.KRC:$src1, Src.KRC:$src2)), (!cast<Instruction>(NAME#rr) Src.KRC:$src2, Src.KRC:$src1)>; @@ -3897,7 +3897,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag, (ins _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))], - _.ExeDomain>, EVEX_4V, Sched<[SchedWriteFShuffle.XMM]>; + _.ExeDomain>, EVEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>; let Predicates = [prd] in { def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), @@ -3906,7 +3906,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag, [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask, (_.VT (OpNode _.RC:$src1, _.RC:$src2)), _.ImmAllZerosV)))], - _.ExeDomain>, EVEX_4V, EVEX_KZ, Sched<[SchedWriteFShuffle.XMM]>; + _.ExeDomain>, EVEX, VVVV, EVEX_KZ, Sched<[SchedWriteFShuffle.XMM]>; let Constraints = "$src0 = $dst" in def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), @@ -3915,7 +3915,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag, [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask, (_.VT (OpNode _.RC:$src1, _.RC:$src2)), (_.VT _.RC:$src0))))], - _.ExeDomain>, EVEX_4V, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>; + _.ExeDomain>, EVEX, VVVV, EVEX_K, Sched<[SchedWriteFShuffle.XMM]>; let canFoldAsLoad = 1, isReMaterializable = 1 in { def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), @@ -4286,7 +4286,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { def VMOVSHZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovsh\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, T_MAP5XS, EVEX_4V, VEX_LIG, + []>, T_MAP5XS, EVEX, VVVV, VEX_LIG, Sched<[SchedWriteFShuffle.XMM]>; let Constraints = "$src0 = $dst" in @@ -4295,20 +4295,20 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { VR128X:$src1, VR128X:$src2), "vmovsh\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", - []>, T_MAP5XS, EVEX_K, EVEX_4V, VEX_LIG, + []>, T_MAP5XS, EVEX_K, EVEX, VVVV, VEX_LIG, Sched<[SchedWriteFShuffle.XMM]>; def VMOVSHZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f16x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2), "vmovsh\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - []>, EVEX_KZ, T_MAP5XS, EVEX_4V, VEX_LIG, + []>, EVEX_KZ, T_MAP5XS, EVEX, VVVV, VEX_LIG, Sched<[SchedWriteFShuffle.XMM]>; } def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, EVEX_4V, VEX_LIG, + []>, XS, EVEX, VVVV, VEX_LIG, Sched<[SchedWriteFShuffle.XMM]>; let Constraints = "$src0 = $dst" in @@ -4317,20 +4317,20 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { VR128X:$src1, VR128X:$src2), "vmovss\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", - []>, EVEX_K, XS, EVEX_4V, VEX_LIG, + []>, EVEX_K, XS, EVEX, VVVV, VEX_LIG, 
Sched<[SchedWriteFShuffle.XMM]>; def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2), "vmovss\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - []>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, + []>, EVEX_KZ, XS, EVEX, VVVV, VEX_LIG, Sched<[SchedWriteFShuffle.XMM]>; def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XD, EVEX_4V, VEX_LIG, REX_W, + []>, XD, EVEX, VVVV, VEX_LIG, REX_W, Sched<[SchedWriteFShuffle.XMM]>; let Constraints = "$src0 = $dst" in @@ -4339,7 +4339,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { VR128X:$src1, VR128X:$src2), "vmovsd\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", - []>, EVEX_K, XD, EVEX_4V, VEX_LIG, + []>, EVEX_K, XD, EVEX, VVVV, VEX_LIG, REX_W, Sched<[SchedWriteFShuffle.XMM]>; def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), @@ -4347,7 +4347,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { VR128X:$src2), "vmovsd\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - []>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + []>, EVEX_KZ, XD, EVEX, VVVV, VEX_LIG, REX_W, Sched<[SchedWriteFShuffle.XMM]>; } @@ -4665,14 +4665,14 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), - IsCommutable, IsCommutable>, AVX512BIBase, EVEX_4V, + IsCommutable, IsCommutable>, AVX512BIBase, EVEX, VVVV, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2)))>, - AVX512BIBase, EVEX_4V, + AVX512BIBase, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4686,7 +4686,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"#_.BroadcastStr, (_.VT (OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src2)))>, - AVX512BIBase, EVEX_4V, EVEX_B, + AVX512BIBase, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4796,13 +4796,13 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2))), IsCommutable>, - AVX512BIBase, EVEX_4V, Sched<[sched]>; + AVX512BIBase, EVEX, VVVV, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (_Src.LdFrag addr:$src2)))>, - AVX512BIBase, EVEX_4V, + AVX512BIBase, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), @@ -4812,7 +4812,7 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, "$src1, ${src2}"#_Brdct.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert (_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>, - AVX512BIBase, EVEX_4V, EVEX_B, + AVX512BIBase, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4884,7 +4884,7 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"#_Src.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert (_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>, - EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>, + EVEX, VVVV, EVEX_B, 
EVEX_CD8<_Src.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -4899,13 +4899,13 @@ multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2))), IsCommutable, IsCommutable>, - EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[sched]>; + EVEX_CD8<_Src.EltSize, CD8VF>, EVEX, VVVV, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (_Src.LdFrag addr:$src2)))>, - EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>, + EVEX, VVVV, EVEX_CD8<_Src.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5445,18 +5445,18 @@ multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDPatternOperator sched.PS.Scl, IsCommutable>, avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, RndNode, sched.PS.Scl>, - XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, sched.PD.Scl, IsCommutable>, avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, RndNode, sched.PD.Scl>, - XD, REX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; + XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>; let Predicates = [HasFP16] in defm SHZ : avx512_fp_scalar<opc, OpcodeStr#"sh", f16x_info, OpNode, VecNode, sched.PH.Scl, IsCommutable>, avx512_fp_scalar_round<opc, OpcodeStr#"sh", f16x_info, RndNode, sched.PH.Scl>, - T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>; + T_MAP5XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>; } multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -5465,16 +5465,16 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, SaeNode, sched.PS.Scl, IsCommutable, NAME#"SS">, - XS, EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, SaeNode, sched.PD.Scl, IsCommutable, NAME#"SD">, - XD, REX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>; + XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>; let Predicates = [HasFP16] in { defm SHZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sh", f16x_info, OpNode, VecNode, SaeNode, sched.PH.Scl, IsCommutable, NAME#"SH">, - T_MAP5XS, EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, + T_MAP5XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, NotEVEX2VEXConvertible; } } @@ -5516,29 +5516,29 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, } defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, SchedWriteFCmp.Scl, "VMINCSS">, XS, - EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; + EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc, SchedWriteFCmp.Scl, "VMINCSD">, XD, - REX_W, EVEX_4V, VEX_LIG, + REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, SIMD_EXC; defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc, SchedWriteFCmp.Scl, "VMAXCSS">, XS, - EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; + EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, SchedWriteFCmp.Scl, "VMAXCSD">, XD, - REX_W, EVEX_4V, VEX_LIG, + REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, SIMD_EXC; defm VMINCSHZ : 
avx512_comutable_binop_s<0x5D, "vminsh", f16x_info, X86fminc, SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5XS, - EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, + EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, NotEVEX2VEXConvertible; defm VMAXCSHZ : avx512_comutable_binop_s<0x5F, "vmaxsh", f16x_info, X86fmaxc, SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5XS, - EVEX_4V, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, + EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, NotEVEX2VEXConvertible; multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, @@ -5556,21 +5556,21 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), (_.VT (MaskOpNode _.RC:$src1, _.RC:$src2)), ClobberConstraint, - IsCommutable, IsKCommutable, IsKCommutable>, EVEX_4V, Sched<[sched]>; + IsCommutable, IsKCommutable, IsKCommutable>, EVEX, VVVV, Sched<[sched]>; let mayLoad = 1 in { defm rm: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#suffix, "$src2, $src1", "$src1, $src2", (OpNode _.RC:$src1, (_.LdFrag addr:$src2)), (MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2)), - ClobberConstraint>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + ClobberConstraint>, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#suffix, "${src2}"#_.BroadcastStr#", $src1", "$src1, ${src2}"#_.BroadcastStr, (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))), (MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))), - ClobberConstraint>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + ClobberConstraint>, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } } @@ -5586,7 +5586,7 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc))), 0, 0, 0, vselect_mask, ClobberConstraint>, - EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; + EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, @@ -5597,7 +5597,7 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix, "{sae}, $src2, $src1", "$src1, $src2, {sae}", (_.VT (OpNodeSAE _.RC:$src1, _.RC:$src2))>, - EVEX_4V, EVEX_B, Sched<[sched]>; + EVEX, VVVV, EVEX_B, Sched<[sched]>; } multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, @@ -5734,18 +5734,18 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2))>, - EVEX_4V, Sched<[sched]>; + EVEX, VVVV, Sched<[sched]>; defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix, "${src2}"#_.BroadcastStr#", $src1", "$src1, ${src2}"#_.BroadcastStr, (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>, - EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, 
VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -5773,7 +5773,7 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>; defm SHZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f16x_info>, avx512_fp_scalar_round<opcScaler, OpcodeStr#"sh", f16x_info, X86scalefsRnd, sched.Scl>, - EVEX_4V, T_MAP6PD, EVEX_CD8<16, CD8VT1>; + EVEX, VVVV, T_MAP6PD, EVEX_CD8<16, CD8VT1>; } defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v16f32_info>, avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v16f32_info>, @@ -5784,11 +5784,11 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>, avx512_fp_scalar_round<opcScaler, OpcodeStr#"ss", f32x_info, X86scalefsRnd, sched.Scl>, - EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, T8PD; + EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, T8PD; defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>, avx512_fp_scalar_round<opcScaler, OpcodeStr#"sd", f64x_info, X86scalefsRnd, sched.Scl>, - EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, REX_W, T8PD; + EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, REX_W, T8PD; // Define only if AVX512VL feature is present. let Predicates = [HasVLX] in { @@ -5825,13 +5825,13 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (null_frag), (null_frag), 1>, - EVEX_4V, Sched<[sched]>; + EVEX, VVVV, Sched<[sched]>; let mayLoad = 1 in defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (null_frag), (null_frag)>, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -5844,7 +5844,7 @@ multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, "${src2}"#_.BroadcastStr#", $src1", "$src1, ${src2}"#_.BroadcastStr, (null_frag), (null_frag)>, - EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + EVEX_B, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -5944,13 +5944,13 @@ multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, VR128X:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2)))>, - AVX512BIBase, EVEX_4V, Sched<[sched]>; + AVX512BIBase, EVEX, VVVV, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, i128mem:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (SrcVT (load addr:$src2))))>, AVX512BIBase, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6035,22 +6035,22 @@ multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq, defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli, SchedWriteVecShiftImm>, avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli, - SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV; defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli, SchedWriteVecShiftImm>, avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli, - SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV; defm VPSRA : 
avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai, SchedWriteVecShiftImm, 1>, avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai, - SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV; defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri, - SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV; defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli, - SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX_4V; + SchedWriteVecShiftImm>, AVX512BIi8Base, EVEX, VVVV; defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl, SchedWriteVecShift>; @@ -6097,13 +6097,13 @@ multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2)))>, - AVX5128IBase, EVEX_4V, Sched<[sched]>; + AVX5128IBase, EVEX, VVVV, Sched<[sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (_.VT (_.LdFrag addr:$src2))))>, - AVX5128IBase, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + AVX5128IBase, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -6116,7 +6116,7 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, "${src2}"#_.BroadcastStr#", $src1", "$src1, ${src2}"#_.BroadcastStr, (_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>, - AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + AVX5128IBase, EVEX_B, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6374,14 +6374,14 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (Ctrl.VT Ctrl.RC:$src2)))>, - T8PD, EVEX_4V, Sched<[sched]>; + T8PD, EVEX, VVVV, Sched<[sched]>; defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (Ctrl.VT (Ctrl.LdFrag addr:$src2))))>, - T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + T8PD, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, @@ -6390,7 +6390,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, (_.VT (OpNode _.RC:$src1, (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>, - T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + T8PD, EVEX, VVVV, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6469,13 +6469,13 @@ def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))]>, - Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V; + Sched<[SchedWriteFShuffle.XMM]>, EVEX, VVVV; let isCommutable = 1 in def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))]>, - Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V; + Sched<[SchedWriteFShuffle.XMM]>, EVEX, VVVV; 
//===----------------------------------------------------------------------===// // VMOVHPS/PD VMOVLPS Instructions @@ -6494,7 +6494,7 @@ multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, (OpNode _.RC:$src1, (_.VT (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))]>, - Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>, EVEX_4V; + Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>, EVEX, VVVV; } // No patterns for MOVLPS/MOVHPS as the Movlhps node should only be created in @@ -6565,14 +6565,14 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), (_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>, - EVEX_4V, Sched<[sched]>; + EVEX, VVVV, Sched<[sched]>; defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), (_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold, + EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -6583,7 +6583,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDPatternOperator _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), (MaskOpNode _.RC:$src2, _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>, - EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold, + EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } } @@ -6598,7 +6598,7 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>, - EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; + EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, @@ -6660,14 +6660,14 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpcodeStr, "$src3, $src2", "$src2, $src3", (null_frag), (_.VT (MaskOpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>, - EVEX_4V, Sched<[sched]>; + EVEX, VVVV, Sched<[sched]>; defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), (_.VT (MaskOpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold, + EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -6679,7 +6679,7 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDPatternOperator _.RC:$src1)), (_.VT (MaskOpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), - _.RC:$src1)), 1, 0>, EVEX_4V, EVEX_B, + _.RC:$src1)), 1, 0>, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } @@ -6695,7 +6695,7 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))), - 1, 
1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; + 1, 1>, EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, @@ -6756,7 +6756,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpcodeStr, "$src3, $src2", "$src2, $src3", (null_frag), (_.VT (MaskOpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>, - EVEX_4V, Sched<[sched]>; + EVEX, VVVV, Sched<[sched]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. @@ -6765,7 +6765,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), (_.VT (MaskOpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold, + EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; // Pattern is 312 order so that the load is in a different place from the @@ -6778,7 +6778,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDPatternOperator _.RC:$src1, _.RC:$src2)), (_.VT (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1, _.RC:$src2)), 1, 0>, - EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold, + EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } } @@ -6793,7 +6793,7 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))), - 1, 1>, EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>; + 1, 1>, EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched]>; } multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, @@ -6851,33 +6851,33 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in { defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>, - EVEX_4V, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC; + EVEX, VVVV, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC; let mayLoad = 1 in defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (null_frag), 1, 1>, - EVEX_4V, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold, + EVEX, VVVV, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC; let Uses = [MXCSR] in defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>, - EVEX_4V, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>; + EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[SchedWriteFMA.Scl]>; let isCodeGenOnly = 1, isCommutable = 1 in { def r : AVX512<opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, EVEX_4V, SIMD_EXC; + !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, EVEX, VVVV, SIMD_EXC; def m : AVX512<opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [RHS_m]>, 
Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold, - SchedWriteFMA.Scl.ReadAfterFold]>, EVEX_4V, SIMD_EXC; + SchedWriteFMA.Scl.ReadAfterFold]>, EVEX, VVVV, SIMD_EXC; let Uses = [MXCSR] in def rb : AVX512<opc, MRMSrcReg, (outs _.FRC:$dst), @@ -6885,7 +6885,7 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in { !strconcat(OpcodeStr, "\t{$rc, $src3, $src2, $dst|$dst, $src2, $src3, $rc}"), !if(MaskOnlyReg, [], [RHS_b])>, EVEX_B, EVEX_RC, - Sched<[SchedWriteFMA.Scl]>, EVEX_4V; + Sched<[SchedWriteFMA.Scl]>, EVEX, VVVV; }// isCodeGenOnly = 1 }// Constraints = "$src1 = $dst" } @@ -7189,13 +7189,13 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>, - T8PD, EVEX_4V, Sched<[sched]>; + T8PD, EVEX, VVVV, Sched<[sched]>; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, - T8PD, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold, + T8PD, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -7205,7 +7205,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)>, - T8PD, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold, + T8PD, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } } @@ -7247,19 +7247,19 @@ let ExeDomain = DstVT.ExeDomain, Uses = _Uses, def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst), (ins DstVT.FRC:$src1, SrcRC:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, - EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>; + EVEX, VVVV, Sched<[sched, ReadDefault, ReadInt2Fpu]>; let mayLoad = 1 in def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst), (ins DstVT.FRC:$src1, x86memop:$src), asm#"{"#mem#"}\t{$src, $src1, $dst|$dst, $src1, $src}", []>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } // hasSideEffects = 0 def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins DstVT.RC:$src1, SrcRC:$src2), !strconcat(asm,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set DstVT.RC:$dst, (OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2))]>, - EVEX_4V, Sched<[sched, ReadDefault, ReadInt2Fpu]>; + EVEX, VVVV, Sched<[sched, ReadDefault, ReadInt2Fpu]>; def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins DstVT.RC:$src1, x86memop:$src2), @@ -7267,7 +7267,7 @@ let ExeDomain = DstVT.ExeDomain, Uses = _Uses, [(set DstVT.RC:$dst, (OpNode (DstVT.VT DstVT.RC:$src1), (ld_frag addr:$src2)))]>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } def : InstAlias<"v"#asm#mem#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", (!cast<Instruction>(NAME#"rr_Int") DstVT.RC:$dst, @@ -7287,7 +7287,7 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, (OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2, (i32 timm:$rc)))]>, - EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched, ReadDefault, ReadInt2Fpu]>; + EVEX, VVVV, EVEX_B, EVEX_RC, Sched<[sched, ReadDefault, ReadInt2Fpu]>; def : InstAlias<"v"#asm#mem#"\t{$src2, $rc, $src1, $dst|$dst, $src1, $rc, $src2}", (!cast<Instruction>(NAME#"rrb_Int") DstVT.RC:$dst, DstVT.RC:$src1, SrcRC:$src2, 
AVX512RC:$rc), 0, "att">; @@ -7646,25 +7646,25 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _ "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), (_Src.VT _Src.RC:$src2)))>, - EVEX_4V, VEX_LIG, Sched<[sched]>; + EVEX, VVVV, VEX_LIG, Sched<[sched]>; defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), (_Src.ScalarIntMemFrags addr:$src2)))>, - EVEX_4V, VEX_LIG, + EVEX, VVVV, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; let isCodeGenOnly = 1, hasSideEffects = 0 in { def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _Src.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - EVEX_4V, VEX_LIG, Sched<[sched]>; + EVEX, VVVV, VEX_LIG, Sched<[sched]>; let mayLoad = 1 in def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - EVEX_4V, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -7678,7 +7678,7 @@ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTIn "{sae}, $src2, $src1", "$src1, $src2, {sae}", (_.VT (OpNodeSAE (_.VT _.RC:$src1), (_Src.VT _Src.RC:$src2)))>, - EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>; + EVEX, VVVV, VEX_LIG, EVEX_B, Sched<[sched]>; } // Scalar Conversion with rounding control (RC) @@ -7691,7 +7691,7 @@ multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInf "$rc, $src2, $src1", "$src1, $src2, $rc", (_.VT (OpNodeRnd (_.VT _.RC:$src1), (_Src.VT _Src.RC:$src2), (i32 timm:$rc)))>, - EVEX_4V, VEX_LIG, Sched<[sched]>, + EVEX, VVVV, VEX_LIG, Sched<[sched]>, EVEX_B, EVEX_RC; } multiclass avx512_cvt_fp_scalar_trunc<bits<8> opc, string OpcodeStr, @@ -9129,12 +9129,12 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, - EVEX_4V, VEX_LIG, Sched<[sched]>; + EVEX, VVVV, VEX_LIG, Sched<[sched]>; defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), - (_.ScalarIntMemFrags addr:$src2))>, EVEX_4V, VEX_LIG, + (_.ScalarIntMemFrags addr:$src2))>, EVEX, VVVV, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -9250,16 +9250,16 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE, - sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD, EVEX_4V; + sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD, EVEX, VVVV; defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, OpNodeSAE, - sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8PD, EVEX_4V; + sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8PD, EVEX, VVVV; } multiclass avx512_vgetexpsh<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { let Predicates = [HasFP16] in defm SHZ : avx512_fp28_s<opc, OpcodeStr#"sh", f16x_info, OpNode, OpNodeSAE, sched>, - EVEX_CD8<16, CD8VT1>, T_MAP6PD, EVEX_4V; + EVEX_CD8<16, CD8VT1>, T_MAP6PD, EVEX, VVVV; } let Predicates = [HasERI] in { @@ -9501,11 
+9501,11 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr, X86SchedWriteSizes sched> { defm SHZ : avx512_sqrt_scalar<opc, OpcodeStr#"sh", sched.PH.Scl, f16x_info, NAME#"SH", HasFP16>, - EVEX_CD8<16, CD8VT1>, EVEX_4V, T_MAP5XS; + EVEX_CD8<16, CD8VT1>, EVEX, VVVV, T_MAP5XS; defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">, - EVEX_CD8<32, CD8VT1>, EVEX_4V, XS; + EVEX_CD8<32, CD8VT1>, EVEX, VVVV, XS; defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">, - EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, REX_W; + EVEX_CD8<64, CD8VT1>, EVEX, VVVV, XD, REX_W; } defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", SchedWriteFSqrtSizes>, @@ -9569,17 +9569,17 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, let Predicates = [HasFP16] in defm VRNDSCALESHZ : avx512_rndscale_scalar<0x0A, "vrndscalesh", SchedWriteFRnd.Scl, f16x_info>, - AVX512PSIi8Base, TA, EVEX_4V, + AVX512PSIi8Base, TA, EVEX, VVVV, EVEX_CD8<16, CD8VT1>; defm VRNDSCALESSZ : avx512_rndscale_scalar<0x0A, "vrndscaless", SchedWriteFRnd.Scl, f32x_info>, - AVX512AIi8Base, EVEX_4V, VEX_LIG, + AVX512AIi8Base, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VRNDSCALESDZ : avx512_rndscale_scalar<0x0B, "vrndscalesd", SchedWriteFRnd.Scl, f64x_info>, - REX_W, AVX512AIi8Base, EVEX_4V, VEX_LIG, + REX_W, AVX512AIi8Base, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>; multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move, @@ -10773,13 +10773,13 @@ multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr, AVX512VLVectorVTInfo SrcInfo, Predicate Pred = HasBWI> { let Predicates = [Pred] in { defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.ZMM, DestInfo.info512, - SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V; + SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX, VVVV; } let Predicates = [Pred, HasVLX] in { defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.XMM, DestInfo.info128, - SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V; + SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX, VVVV; defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, sched.YMM, DestInfo.info256, - SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V; + SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX, VVVV; } } @@ -10835,38 +10835,38 @@ defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26 defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, 0x50, X86VRange, X86VRangeSAE, SchedWriteFAdd, HasDQI>, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W; + AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info, 0x50, X86VRange, X86VRangeSAE, SchedWriteFAdd, HasDQI>, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", f64x_info, 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>, - AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W; + AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W; defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info, 0x51, X86Ranges, X86RangesSAE, SchedWriteFAdd, HasDQI>, - AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>; defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, 
0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>, - AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W; + AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W; defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasDQI>, - AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>; defm VREDUCESH: avx512_common_fp_sae_scalar_imm<"vreducesh", f16x_info, 0x57, X86Reduces, X86ReducesSAE, SchedWriteFRnd, HasFP16>, - AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>; + AVX512PSIi8Base, TA, VEX_LIG, EVEX, VVVV, EVEX_CD8<16, CD8VT1>; defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>, - AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W; + AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W; defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasAVX512>, - AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>; defm VGETMANTSH: avx512_common_fp_sae_scalar_imm<"vgetmantsh", f16x_info, 0x27, X86GetMants, X86GetMantsSAE, SchedWriteFRnd, HasFP16>, - AVX512PSIi8Base, TA, VEX_LIG, EVEX_4V, EVEX_CD8<16, CD8VT1>; + AVX512PSIi8Base, TA, VEX_LIG, EVEX, VVVV, EVEX_CD8<16, CD8VT1>; multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, @@ -10920,13 +10920,13 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, X86FoldableSchedWrite sched } defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", WriteFShuffle256, - avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + avx512vl_f32_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", WriteFShuffle256, - avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W; + avx512vl_f64_info, avx512vl_f64_info, 0x23, "VPERM2F128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", WriteFShuffle256, - avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + avx512vl_i32_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", WriteFShuffle256, - avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W; + avx512vl_i64_info, avx512vl_i64_info, 0x43, "VPERM2I128">, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; multiclass avx512_valign<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _>{ @@ -10962,15 +10962,15 @@ multiclass avx512_valign_common<string OpcodeStr, X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in { defm Z : avx512_valign<0x03, OpcodeStr, sched.ZMM, _.info512>, - AVX512AIi8Base, EVEX_4V, EVEX_V512; + AVX512AIi8Base, EVEX, VVVV, EVEX_V512; } let Predicates = [HasAVX512, HasVLX] in { defm Z128 : avx512_valign<0x03, OpcodeStr, sched.XMM, _.info128>, - AVX512AIi8Base, EVEX_4V, EVEX_V128; + AVX512AIi8Base, EVEX, VVVV, EVEX_V128; // We can't really override the 256-bit version so change it 
back to unset. let EVEX2VEXOverride = ? in defm Z256 : avx512_valign<0x03, OpcodeStr, sched.YMM, _.info256>, - AVX512AIi8Base, EVEX_4V, EVEX_V256; + AVX512AIi8Base, EVEX, VVVV, EVEX_V256; } } @@ -11427,7 +11427,7 @@ multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), immoperator:$src3)))]>, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; + EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecInsert.Folded, WriteVecInsert.ReadAfterFold]>; } multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -11437,7 +11437,7 @@ multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3), OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, - (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX_4V, + (OpNode _.RC:$src1, GR32orGR64:$src2, timm:$src3))]>, EVEX, VVVV, Sched<[WriteVecInsert]>; defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag, timm>; @@ -11452,7 +11452,7 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>, - EVEX_4V, TAPD, Sched<[WriteVecInsert]>; + EVEX, VVVV, TAPD, Sched<[WriteVecInsert]>; defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _, _.ScalarLdFrag, imm>, TAPD; @@ -11501,7 +11501,7 @@ multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_FP>{ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp, SchedWriteFShuffle>, EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>, - AVX512AIi8Base, EVEX_4V; + AVX512AIi8Base, EVEX, VVVV; } defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_f32_info>, PS; @@ -11543,10 +11543,10 @@ multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr, } defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", SchedWriteShuffle, HasBWI>, - AVX512PDIi8Base, EVEX_4V, WIG; + AVX512PDIi8Base, EVEX, VVVV, WIG; defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", SchedWriteShuffle, HasBWI>, - AVX512PDIi8Base, EVEX_4V, WIG; + AVX512PDIi8Base, EVEX, VVVV, WIG; multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, string OpcodeStr, X86FoldableSchedWrite sched, @@ -11584,7 +11584,7 @@ multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, } defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", - SchedWritePSADBW, HasBWI>, EVEX_4V, WIG; + SchedWritePSADBW, HasBWI>, EVEX, VVVV, WIG; // Transforms to swizzle an immediate to enable better matching when // memory operand isn't in the right place. 
@@ -11659,7 +11659,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_.VT _.RC:$src3), (i8 timm:$src4)), 1, 1>, - AVX512AIi8Base, EVEX_4V, Sched<[sched]>; + AVX512AIi8Base, EVEX, VVVV, Sched<[sched]>; defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4", @@ -11667,7 +11667,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_.VT (bitconvert (_.LdFrag addr:$src3))), (i8 timm:$src4)), 1, 0>, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4), @@ -11677,7 +11677,7 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_.VT (_.BroadcastLdFrag addr:$src3)), (i8 timm:$src4)), 1, 0>, EVEX_B, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; }// Constraints = "$src1 = $dst" @@ -12002,23 +12002,23 @@ multiclass avx512_fixupimm_packed_all<X86SchedWriteWidths sched, let Predicates = [HasAVX512] in defm Z : avx512_fixupimm_packed_sae<0x54, "vfixupimm", sched.ZMM, _Vec.info512, _Tbl.info512>, AVX512AIi8Base, - EVEX_4V, EVEX_V512; + EVEX, VVVV, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.XMM, _Vec.info128, _Tbl.info128>, AVX512AIi8Base, - EVEX_4V, EVEX_V128; + EVEX, VVVV, EVEX_V128; defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", sched.YMM, _Vec.info256, _Tbl.info256>, AVX512AIi8Base, - EVEX_4V, EVEX_V256; + EVEX, VVVV, EVEX_V256; } } defm VFIXUPIMMSSZ : avx512_fixupimm_scalar<0x55, "vfixupimm", SchedWriteFAdd.Scl, f32x_info, v4i32x_info>, - AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; + AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<32, CD8VT1>; defm VFIXUPIMMSDZ : avx512_fixupimm_scalar<0x55, "vfixupimm", SchedWriteFAdd.Scl, f64x_info, v2i64x_info>, - AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, REX_W; + AVX512AIi8Base, VEX_LIG, EVEX, VVVV, EVEX_CD8<64, CD8VT1>, REX_W; defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f64_info, @@ -12165,17 +12165,17 @@ multiclass avx512_vaes<bits<8> Op, string OpStr, string IntPrefix> { defm Z128 : AESI_binop_rm_int<Op, OpStr, !cast<Intrinsic>(IntPrefix), loadv2i64, 0, VR128X, i128mem>, - EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V128, WIG; + EVEX, VVVV, EVEX_CD8<64, CD8VF>, EVEX_V128, WIG; defm Z256 : AESI_binop_rm_int<Op, OpStr, !cast<Intrinsic>(IntPrefix#"_256"), loadv4i64, 0, VR256X, i256mem>, - EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V256, WIG; + EVEX, VVVV, EVEX_CD8<64, CD8VF>, EVEX_V256, WIG; } let Predicates = [HasAVX512, HasVAES] in defm Z : AESI_binop_rm_int<Op, OpStr, !cast<Intrinsic>(IntPrefix#"_512"), loadv8i64, 0, VR512, i512mem>, - EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V512, WIG; + EVEX, VVVV, EVEX_CD8<64, CD8VF>, EVEX_V512, WIG; } defm VAESENC : avx512_vaes<0xDC, "vaesenc", "int_x86_aesni_aesenc">; @@ -12189,14 +12189,14 @@ defm VAESDECLAST : avx512_vaes<0xDF, "vaesdeclast", "int_x86_aesni_aesdeclast"> let Predicates = [HasAVX512, HasVPCLMULQDQ] in defm 
VPCLMULQDQZ : vpclmulqdq<VR512, i512mem, loadv8i64, int_x86_pclmulqdq_512>, - EVEX_4V, EVEX_V512, EVEX_CD8<64, CD8VF>, WIG; + EVEX, VVVV, EVEX_V512, EVEX_CD8<64, CD8VF>, WIG; let Predicates = [HasVLX, HasVPCLMULQDQ] in { defm VPCLMULQDQZ128 : vpclmulqdq<VR128X, i128mem, loadv2i64, int_x86_pclmulqdq>, - EVEX_4V, EVEX_V128, EVEX_CD8<64, CD8VF>, WIG; + EVEX, VVVV, EVEX_V128, EVEX_CD8<64, CD8VF>, WIG; defm VPCLMULQDQZ256: vpclmulqdq<VR256X, i256mem, loadv4i64, - int_x86_pclmulqdq_256>, EVEX_4V, EVEX_V256, + int_x86_pclmulqdq_256>, EVEX, VVVV, EVEX_V256, EVEX_CD8<64, CD8VF>, WIG; } @@ -12217,13 +12217,13 @@ multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode, (ins VTI.RC:$src2, VTI.RC:$src3), OpStr, "$src3, $src2", "$src2, $src3", (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>, - T8PD, EVEX_4V, Sched<[sched]>; + T8PD, EVEX, VVVV, Sched<[sched]>; defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, "$src3, $src2", "$src2, $src3", (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.LdFrag addr:$src3))))>, - T8PD, EVEX_4V, + T8PD, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -12239,7 +12239,7 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, "$src2, ${src3}"#VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, - T8PD, EVEX_4V, EVEX_B, + T8PD, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12284,9 +12284,9 @@ multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix, avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>, REX_W, EVEX_CD8<16, CD8VF>; defm D : avx512_common_3Op_imm8<Prefix#"d", avx512vl_i32_info, dqOp, - OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<32, CD8VF>; defm Q : avx512_common_3Op_imm8<Prefix#"q", avx512vl_i64_info, dqOp, OpNode, - sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, REX_W; + sched, HasVBMI2>, AVX512AIi8Base, EVEX, VVVV, EVEX_CD8<64, CD8VF>, REX_W; } // Concat & Shift @@ -12321,13 +12321,13 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3)), IsCommutable, IsCommutable>, - EVEX_4V, T8PD, Sched<[sched]>; + EVEX, VVVV, T8PD, Sched<[sched]>; defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, "$src3, $src2", "$src2, $src3", (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.LdFrag addr:$src3))))>, - EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD, + EVEX, VVVV, EVEX_CD8<32, CD8VF>, T8PD, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), @@ -12336,7 +12336,7 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, "$src2, ${src3}"#VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, - EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, + EVEX, VVVV, EVEX_CD8<32, CD8VF>, EVEX_B, T8PD, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } @@ -12406,7 +12406,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2)), (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1), - (VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD, + (VTI.VT VTI.RC:$src2))>, EVEX, VVVV, T8PD, Sched<[sched]>; defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, 
(outs VTI.KRC:$dst), (ins VTI.RC:$src1, VTI.MemOp:$src2), @@ -12416,7 +12416,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { (VTI.VT (VTI.LdFrag addr:$src2))), (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.LdFrag addr:$src2)))>, - EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD, + EVEX, VVVV, EVEX_CD8<8, CD8VF>, T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12483,10 +12483,10 @@ multiclass GF2P8AFFINE_avx512_common<bits<8> Op, string OpStr, SDNode OpNode, defm VGF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb", X86GF2P8affineinvqb, SchedWriteVecIMul>, - EVEX_4V, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base; + EVEX, VVVV, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base; defm VGF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb", X86GF2P8affineqb, SchedWriteVecIMul>, - EVEX_4V, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base; + EVEX, VVVV, EVEX_CD8<8, CD8VF>, REX_W, AVX512AIi8Base; //===----------------------------------------------------------------------===// @@ -12498,25 +12498,25 @@ let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle, defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info, (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), "v4fmaddps", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, + []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>, Sched<[SchedWriteFMA.ZMM.Folded]>; defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info, (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), "v4fnmaddps", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, + []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>, Sched<[SchedWriteFMA.ZMM.Folded]>; defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info, (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), "v4fmaddss", "$src3, $src2", "$src2, $src3", - []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, + []>, VEX_LIG, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VF>, Sched<[SchedWriteFMA.Scl.Folded]>; defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info, (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), "v4fnmaddss", "$src3, $src2", "$src2, $src3", - []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, + []>, VEX_LIG, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VF>, Sched<[SchedWriteFMA.Scl.Folded]>; } @@ -12529,13 +12529,13 @@ let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedInt, defm VP4DPWSSDrm : AVX512_maskable_3src_in_asm<0x52, MRMSrcMem, v16i32_info, (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), "vp4dpwssd", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, + []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>, Sched<[SchedWriteFMA.ZMM.Folded]>; defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info, (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), "vp4dpwssds", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, + []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>, Sched<[SchedWriteFMA.ZMM.Folded]>; } @@ -12558,7 +12558,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRPC:$dst, (X86vp2intersect _.RC:$src1, (_.VT _.RC:$src2)))]>, - EVEX_4V, T8XD, Sched<[sched]>; + EVEX, VVVV, T8XD, Sched<[sched]>; def rm : I<0x68, MRMSrcMem, (outs _.KRPC:$dst), @@ -12567,7 +12567,7 @@ multiclass 
avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRPC:$dst, (X86vp2intersect _.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, - EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>, + EVEX, VVVV, T8XD, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmb : I<0x68, MRMSrcMem, @@ -12577,7 +12577,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"), [(set _.KRPC:$dst, (X86vp2intersect _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>, - EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + EVEX, VVVV, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12744,13 +12744,13 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins src_v.RC:$src2, src_v.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, src_v.RC:$src2, src_v.RC:$src3))>, - EVEX_4V, Sched<[sched]>; + EVEX, VVVV, Sched<[sched]>; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins src_v.RC:$src2, src_v.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, src_v.RC:$src2, - (src_v.LdFrag addr:$src3)))>, EVEX_4V, + (src_v.LdFrag addr:$src3)))>, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -12760,7 +12760,7 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat("$src2, ${src3}", _.BroadcastStr), (_.VT (OpNode _.RC:$src1, src_v.RC:$src2, (src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>, - EVEX_B, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX_B, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } } // Constraints = "$src1 = $dst" @@ -13390,17 +13390,17 @@ let Constraints = "@earlyclobber $dst, $src1 = $dst" in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), IsCommutable>, EVEX_4V; + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), IsCommutable>, EVEX, VVVV; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, EVEX_4V; + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, EVEX, VVVV; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr), - (_.VT (OpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1))>, EVEX_B, EVEX_4V; + (_.VT (OpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1))>, EVEX_B, EVEX, VVVV; } } // Constraints = "@earlyclobber $dst, $src1 = $dst" @@ -13411,7 +13411,7 @@ multiclass avx512_cfmaop_round<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc)))>, - EVEX_4V, EVEX_B, EVEX_RC; + EVEX, VVVV, EVEX_B, EVEX_RC; } @@ -13504,12 +13504,12 @@ multiclass avx512_cfmbinop_sh_common<bits<8> opc, string OpcodeStr, SDNode OpNod let Uses = [MXCSR] in { defm VFMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfmaddcsh", 
x86vfmaddcSh, x86vfmaddcShRnd, 1>, - T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V; + T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX, VVVV; defm VFCMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfcmaddcsh", x86vfcmaddcSh, x86vfcmaddcShRnd, 0>, - T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V; + T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX, VVVV; defm VFMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfmulcsh", x86vfmulcSh, x86vfmulcShRnd, 1>, - T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V; + T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV; defm VFCMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfcmulcsh", x86vfcmulcSh, x86vfcmulcShRnd, 0>, - T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V; + T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV; } diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 87feb7dc3b4ee..6f4b69c9b5c9f 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1095,23 +1095,23 @@ let Predicates = [HasBMI, NoEGPR] in { def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>, - VEX_4V, Sched<[sched]>; + VEX, VVVV, Sched<[sched]>; def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>, - VEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + VEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } let Predicates = [HasBMI, HasEGPR, In64BitMode] in { def rr_EVEX : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>, - EVEX_4V, Sched<[sched]>; + EVEX, VVVV, Sched<[sched]>; def rm_EVEX : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -1141,12 +1141,12 @@ let hasSideEffects = 0 in { let Predicates = [HasBMI2, NoEGPR] in { def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8XD, VEX_4V, Sched<[WriteIMulH, sched]>; + []>, T8XD, VEX, VVVV, Sched<[WriteIMulH, sched]>; let mayLoad = 1 in def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8XD, VEX_4V, + []>, T8XD, VEX, VVVV, Sched<[WriteIMulHLd, sched.Folded, // Memory operand. 
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -1165,11 +1165,11 @@ let Predicates = [HasBMI2, NoEGPR] in { let Predicates = [HasBMI2, HasEGPR, In64BitMode] in def rr#_EVEX : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8XD, EVEX_4V, Sched<[WriteIMulH, sched]>; + []>, T8XD, EVEX, VVVV, Sched<[WriteIMulH, sched]>; let Predicates = [HasBMI2, HasEGPR, In64BitMode], mayLoad = 1 in def rm#_EVEX : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8XD, EVEX_4V, + []>, T8XD, EVEX, VVVV, Sched<[WriteIMulHLd, sched.Folded, // Memory operand. ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 3006969b76d67..a6bed74b5bef1 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -165,10 +165,10 @@ def POPP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "popp\t$reg", []>, REX_W, ExplicitREX2Prefix, Requires<[In64BitMode]>; def POP2: I<0x8F, MRM0r, (outs GR64:$reg1, GR64:$reg2), (ins), "pop2\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX_4V, EVEX_B, T_MAP4PS; + []>, EVEX, VVVV, EVEX_B, T_MAP4PS; def POP2P: I<0x8F, MRM0r, (outs GR64:$reg1, GR64:$reg2), (ins), "pop2p\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX_4V, EVEX_B, T_MAP4PS, REX_W; + []>, EVEX, VVVV, EVEX_B, T_MAP4PS, REX_W; } // mayLoad, SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in @@ -186,10 +186,10 @@ def PUSHP64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "pushp\t$reg", []>, REX_W, ExplicitREX2Prefix, Requires<[In64BitMode]>; def PUSH2: I<0xFF, MRM6r, (outs), (ins GR64:$reg1, GR64:$reg2), "push2\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX_4V, EVEX_B, T_MAP4PS; + []>, EVEX, VVVV, EVEX_B, T_MAP4PS; def PUSH2P: I<0xFF, MRM6r, (outs), (ins GR64:$reg1, GR64:$reg2), "push2p\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX_4V, EVEX_B, T_MAP4PS, REX_W; + []>, EVEX, VVVV, EVEX_B, T_MAP4PS, REX_W; } // mayStore, SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in { def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>, @@ -1218,11 +1218,11 @@ multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, let hasSideEffects = 0 in { def rr#Suffix : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8PS, VEX_4V, Sched<[sched]>; + T8PS, VEX, VVVV, Sched<[sched]>; let mayLoad = 1 in def rm#Suffix : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8PS, VEX_4V, Sched<[sched.Folded]>; + T8PS, VEX, VVVV, Sched<[sched.Folded]>; } } @@ -1371,11 +1371,11 @@ multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, def rr#Suffix : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>, - VEX_4V, Sched<[WriteALU]>; + VEX, VVVV, Sched<[WriteALU]>; def rm#Suffix : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>, - VEX_4V, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>; + VEX, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>; } let Predicates = [HasBMI2, NoEGPR] in { @@ -1419,12 +1419,12 @@ 
multiclass lwpins_intr<RegisterClass RC> { def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, timm:$cntl))]>, - XOP_4V, XOPA; + XOP, VVVV, XOPA; let mayLoad = 1 in def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), timm:$cntl))]>, - XOP_4V, XOPA; + XOP, VVVV, XOPA; } let Defs = [EFLAGS] in { @@ -1435,12 +1435,12 @@ let Defs = [EFLAGS] in { multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> { def rri : Ii32<0x12, MRM1r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP_4V, XOPA; + [(Int RC:$src0, GR32:$src1, timm:$cntl)]>, XOP, VVVV, XOPA; let mayLoad = 1 in def rmi : Ii32<0x12, MRM1m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), "lwpval\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", [(Int RC:$src0, (loadi32 addr:$src1), timm:$cntl)]>, - XOP_4V, XOPA; + XOP, VVVV, XOPA; } defm LWPVAL32 : lwpval_intr<GR32, int_x86_lwpval32>; @@ -1670,14 +1670,14 @@ def CMPCCXADDmr32 : I<0xe0, MRMDestMem4VOp3CC, (outs GR32:$dst), "cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", [(set GR32:$dst, (X86cmpccxadd addr:$dstsrc2, GR32:$dstsrc1, GR32:$src3, timm:$cond))]>, - VEX_4V, T8PD, Sched<[WriteXCHG]>; + VEX, VVVV, T8PD, Sched<[WriteXCHG]>; def CMPCCXADDmr64 : I<0xe0, MRMDestMem4VOp3CC, (outs GR64:$dst), (ins GR64:$dstsrc1, i64mem:$dstsrc2, GR64:$src3, ccode:$cond), "cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", [(set GR64:$dst, (X86cmpccxadd addr:$dstsrc2, GR64:$dstsrc1, GR64:$src3, timm:$cond))]>, - VEX_4V, REX_W, T8PD, Sched<[WriteXCHG]>; + VEX, VVVV, REX_W, T8PD, Sched<[WriteXCHG]>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 2e1560a9f7dc1..d91c7740aae39 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -215,7 +215,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, let Predicates = [UseAVX, OptForSize] in defm V#NAME : sse12_move_rr<OpNode, vt, OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>, - VEX_4V, VEX_LIG, WIG; + VEX, VVVV, VEX_LIG, WIG; def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -683,7 +683,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, SDPatternOperator pdnode, let Predicates = [UseAVX] in defm V#NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc, "\t{$src2, $src1, $dst|$dst, $src1, $src2}">, - VEX_4V, WIG; + VEX, VVVV, WIG; let Constraints = "$src1 = $dst" in defm NAME : sse12_mov_hilo_packed_base<opc, pdnode, base_opc, @@ -823,14 +823,14 @@ let Predicates = [UseAVX] in { "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>, - VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, WIG; + VEX, VVVV, Sched<[SchedWriteFShuffle.XMM]>, WIG; let isCommutable = 1 in def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))]>, - VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, WIG; + VEX, VVVV, 
Sched<[SchedWriteFShuffle.XMM]>, WIG; } let Constraints = "$src1 = $dst" in { def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst), @@ -941,16 +941,16 @@ defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64, // where appropriate to do so. let isCodeGenOnly = 1 in { defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l", - WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V, + WriteCvtI2SS, SSEPackedSingle>, XS, VEX, VVVV, VEX_LIG, SIMD_EXC; defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q", - WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V, + WriteCvtI2SS, SSEPackedSingle>, XS, VEX, VVVV, REX_W, VEX_LIG, SIMD_EXC; defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l", - WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V, + WriteCvtI2SD, SSEPackedDouble>, XD, VEX, VVVV, VEX_LIG; defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q", - WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V, + WriteCvtI2SD, SSEPackedDouble>, XD, VEX, VVVV, REX_W, VEX_LIG, SIMD_EXC; } // isCodeGenOnly = 1 @@ -1090,16 +1090,16 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si, let Predicates = [UseAVX] in { defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>, - XS, VEX_4V, VEX_LIG, SIMD_EXC; + XS, VEX, VVVV, VEX_LIG, SIMD_EXC; defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>, - XS, VEX_4V, VEX_LIG, REX_W, SIMD_EXC; + XS, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC; defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>, - XD, VEX_4V, VEX_LIG; + XD, VEX, VVVV, VEX_LIG; defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>, - XD, VEX_4V, VEX_LIG, REX_W, SIMD_EXC; + XD, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC; } let Constraints = "$src1 = $dst" in { defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, @@ -1289,13 +1289,13 @@ let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [UseAVX], def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - VEX_4V, VEX_LIG, WIG, + VEX, VVVV, VEX_LIG, WIG, Sched<[WriteCvtSD2SS]>, SIMD_EXC; let mayLoad = 1 in def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - XD, VEX_4V, VEX_LIG, WIG, + XD, VEX, VVVV, VEX_LIG, WIG, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC; } @@ -1321,14 +1321,14 @@ def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg, "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>, - XD, VEX_4V, VEX_LIG, WIG, Requires<[UseAVX]>, + XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>, Sched<[WriteCvtSD2SS]>; def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>, - XD, VEX_4V, VEX_LIG, WIG, Requires<[UseAVX]>, + XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; let Constraints = "$src1 = $dst" in { def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg, @@ -1353,13 +1353,13 @@ let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in { 
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - XS, VEX_4V, VEX_LIG, WIG, + XS, VEX, VVVV, VEX_LIG, WIG, Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC; let mayLoad = 1 in def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - XS, VEX_4V, VEX_LIG, WIG, + XS, VEX, VVVV, VEX_LIG, WIG, Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, Requires<[UseAVX, OptForSize]>, SIMD_EXC; } // isCodeGenOnly = 1, hasSideEffects = 0 @@ -1386,13 +1386,13 @@ let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1, def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, VEX_4V, VEX_LIG, WIG, + []>, XS, VEX, VVVV, VEX_LIG, WIG, Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>; let mayLoad = 1 in def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, VEX_4V, VEX_LIG, WIG, Requires<[HasAVX]>, + []>, XS, VEX, VVVV, VEX_LIG, WIG, Requires<[HasAVX]>, Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>; let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg, @@ -1860,12 +1860,12 @@ let ExeDomain = SSEPackedSingle in defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32, "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, - XS, VEX_4V, VEX_LIG, WIG; + XS, VEX, VVVV, VEX_LIG, WIG; let ExeDomain = SSEPackedDouble in defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64, "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, - XD, VEX_4V, VEX_LIG, WIG; + XD, VEX, VVVV, VEX_LIG, WIG; let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedSingle in @@ -1979,16 +1979,16 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX_4V, WIG; + SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX, VVVV, WIG; defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX_4V, WIG; + SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX, VVVV, WIG; defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32, "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L, WIG; + SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX, VVVV, VEX_L, WIG; defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64, "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L, WIG; + SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX, VVVV, VEX_L, WIG; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", @@ -2076,19 +2076,19 @@ let Predicates = [HasAVX, NoVLX] in { defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $src1, 
$dst|$dst, $src1, $src2, $src3}", loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, - PS, VEX_4V, WIG; + PS, VEX, VVVV, WIG; defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>, - PS, VEX_4V, VEX_L, WIG; + PS, VEX, VVVV, VEX_L, WIG; defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, - PD, VEX_4V, WIG; + PD, VEX, VVVV, WIG; defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>, - PD, VEX_4V, VEX_L, WIG; + PD, VEX, VVVV, VEX_L, WIG; } let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, @@ -2126,29 +2126,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, let Predicates = [HasAVX, NoVLX] in { defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, WIG; + SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX, VVVV, WIG; defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load, VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, WIG; + SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX, VVVV, WIG; defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load, VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, WIG; + SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX, VVVV, WIG; defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load, VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, WIG; + SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX, VVVV, WIG; defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load, VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, WIG; + SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX, VVVV, VEX_L, WIG; defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load, VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, WIG; + SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX, VVVV, VEX_L, WIG; defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load, VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX_4V, VEX_L, WIG; + SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX, VVVV, VEX_L, WIG; defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX_4V, VEX_L, WIG; + SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX, VVVV, VEX_L, WIG; }// Predicates = [HasAVX, NoVLX] let Constraints = "$src1 = $dst" in { @@ -2276,7 +2276,7 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, VR128, load, i128mem, sched.XMM, - IsCommutable, 0>, VEX_4V, WIG; + 
IsCommutable, 0>, VEX, VVVV, WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, @@ -2285,7 +2285,7 @@ let Constraints = "$src1 = $dst" in let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT256, VR256, load, i256mem, sched.YMM, - IsCommutable, 0>, VEX_4V, VEX_L, WIG; + IsCommutable, 0>, VEX, VVVV, VEX_L, WIG; } // These are ordered here for pattern ordering requirements with the fp versions @@ -2312,19 +2312,19 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM, - [], [], 0>, PS, VEX_4V, VEX_L, WIG; + [], [], 0>, PS, VEX, VVVV, VEX_L, WIG; defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM, - [], [], 0>, PD, VEX_4V, VEX_L, WIG; + [], [], 0>, PD, VEX, VVVV, VEX_L, WIG; defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, - [], [], 0>, PS, VEX_4V, WIG; + [], [], 0>, PS, VEX, VVVV, WIG; defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, - [], [], 0>, PD, VEX_4V, WIG; + [], [], 0>, PD, VEX, VVVV, WIG; } let Constraints = "$src1 = $dst" in { @@ -2636,17 +2636,17 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in { let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, - SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX_4V, WIG; + SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX, VVVV, WIG; defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, loadv2f64, - SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX_4V, WIG; + SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX, VVVV, WIG; defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, v8f32, f256mem, loadv8f32, - SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX_4V, VEX_L, WIG; + SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX, VVVV, VEX_L, WIG; defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, loadv4f64, - SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX_4V, VEX_L, WIG; + SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX, VVVV, VEX_L, WIG; } let Constraints = "$src1 = $dst" in { @@ -2665,10 +2665,10 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperat let Uses = [MXCSR], mayRaiseFPException = 1 in { defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>, - XS, VEX_4V, VEX_LIG, WIG; + XS, VEX, VVVV, VEX_LIG, WIG; defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), OpNode, FR64, f64mem, SSEPackedDouble, sched.PD.Scl, 0>, - XD, VEX_4V, VEX_LIG, WIG; + XD, VEX, VVVV, VEX_LIG, WIG; let Constraints = "$src1 = $dst" in { defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), @@ -2687,10 +2687,10 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, let Uses = [MXCSR], mayRaiseFPException = 1 in { defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32, !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, - SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, WIG; + SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX, VVVV, VEX_LIG, WIG; defm V#NAME#SD : 
sse12_fp_scalar_int<opc, OpNode, VR128, v2f64, !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, - SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX_4V, VEX_LIG, WIG; + SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX, VVVV, VEX_LIG, WIG; let Constraints = "$src1 = $dst" in { defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32, @@ -3020,7 +3020,7 @@ multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> { defm V#NAME#SS : avx_fp_unop_s_intr<v4f32, sse_load_f32, !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), AVXTarget>, - XS, VEX_4V, VEX_LIG, WIG; + XS, VEX, VVVV, VEX_LIG, WIG; } multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, @@ -3029,7 +3029,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNod ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32, f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, - XS, VEX_4V, VEX_LIG, WIG; + XS, VEX, VVVV, VEX_LIG, WIG; } multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, @@ -3038,7 +3038,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNod sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64, f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, - XD, VEX_4V, VEX_LIG, WIG; + XD, VEX, VVVV, VEX_LIG, WIG; } // Square root. @@ -3537,12 +3537,12 @@ defm PMULUDQ : PDI_binop_all<0xF4, "pmuludq", X86pmuludq, v2i64, v4i64, let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, load, i128mem, SchedWriteVecIMul.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16, VR256, load, i256mem, SchedWriteVecIMul.YMM, - 0>, VEX_4V, VEX_L, WIG; + 0>, VEX, VVVV, VEX_L, WIG; let Constraints = "$src1 = $dst" in defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, memop, i128mem, SchedWriteVecIMul.XMM>; @@ -3550,11 +3550,11 @@ defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128, let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, load, i128mem, SchedWritePSADBW.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, load, i256mem, SchedWritePSADBW.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; let Constraints = "$src1 = $dst" in defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, memop, i128mem, SchedWritePSADBW.XMM>; @@ -3604,11 +3604,11 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm, let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), OpNode, OpNode2, VR128, sched.XMM, schedImm.XMM, - DstVT128, SrcVT, load, 0>, VEX_4V, WIG; + DstVT128, SrcVT, load, 0>, VEX, VVVV, WIG; let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr), OpNode, OpNode2, VR256, sched.YMM, schedImm.YMM, - DstVT256, SrcVT, load, 0>, VEX_4V, VEX_L, + DstVT256, SrcVT, load, 0>, VEX, VVVV, VEX_L, WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2, @@ -3631,11 +3631,11 @@ 
multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched> { let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, - VR128, v16i8, sched.XMM, 0>, VEX_4V, WIG; + VR128, v16i8, sched.XMM, 0>, VEX, VVVV, WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode, VR256, v32i8, sched.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8, sched.XMM>; @@ -3821,33 +3821,33 @@ multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; } let Constraints = "$src1 = $dst" in { @@ -3892,61 +3892,61 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; } let Predicates = [HasAVX, NoVLX] in { defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, i128mem, SchedWriteShuffle.XMM, load, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; 
} let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; } let Predicates = [HasAVX2, NoVLX] in { defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, i256mem, SchedWriteShuffle.YMM, load, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; } let Constraints = "$src1 = $dst" in { @@ -4014,7 +4014,7 @@ def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, // Insert let Predicates = [HasAVX, NoBWI] in -defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V, WIG; +defm VPINSRW : sse2_pinsrw<0>, PD, VEX, VVVV, WIG; let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in defm PINSRW : sse2_pinsrw, PD; @@ -4563,18 +4563,18 @@ let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem, SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>, - XD, VEX_4V, WIG; + XD, VEX, VVVV, WIG; defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem, SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>, - XD, VEX_4V, VEX_L, WIG; + XD, VEX, VVVV, VEX_L, WIG; } let ExeDomain = SSEPackedDouble in { defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem, SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>, - PD, VEX_4V, WIG; + PD, VEX, VVVV, WIG; defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem, SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>, - PD, VEX_4V, VEX_L, WIG; + PD, VEX, VVVV, VEX_L, WIG; } } let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { @@ -4635,23 +4635,23 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in { let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, - X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX_4V, WIG; + X86fhadd, WriteFHAdd, loadv4f32, 0>, VEX, VVVV, WIG; defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, - X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX_4V, WIG; + X86fhsub, WriteFHAdd, loadv4f32, 0>, VEX, VVVV, WIG; defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, - X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, WIG; + X86fhadd, WriteFHAddY, loadv8f32, 0>, VEX, VVVV, VEX_L, WIG; defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, - X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX_4V, VEX_L, WIG; + X86fhsub, WriteFHAddY, loadv8f32, 0>, VEX, VVVV, VEX_L, WIG; } let ExeDomain = SSEPackedDouble in { defm VHADDPD : S3_Int<0x7C, "vhaddpd", v2f64, VR128, f128mem, - X86fhadd, 
WriteFHAdd, loadv2f64, 0>, VEX_4V, WIG; + X86fhadd, WriteFHAdd, loadv2f64, 0>, VEX, VVVV, WIG; defm VHSUBPD : S3_Int<0x7D, "vhsubpd", v2f64, VR128, f128mem, - X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX_4V, WIG; + X86fhsub, WriteFHAdd, loadv2f64, 0>, VEX, VVVV, WIG; defm VHADDPDY : S3_Int<0x7C, "vhaddpd", v4f64, VR256, f256mem, - X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, WIG; + X86fhadd, WriteFHAddY, loadv4f64, 0>, VEX, VVVV, VEX_L, WIG; defm VHSUBPDY : S3_Int<0x7D, "vhsubpd", v4f64, VR256, f256mem, - X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX_4V, VEX_L, WIG; + X86fhsub, WriteFHAddY, loadv4f64, 0>, VEX, VVVV, VEX_L, WIG; } } @@ -4806,45 +4806,45 @@ let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in { let isCommutable = 0 in { defm VPSHUFB : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8, VR128, load, i128mem, - SchedWriteVarShuffle.XMM, 0>, VEX_4V, WIG; + SchedWriteVarShuffle.XMM, 0>, VEX, VVVV, WIG; defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16, v16i8, VR128, load, i128mem, - SchedWriteVecIMul.XMM, 0>, VEX_4V, WIG; + SchedWriteVecIMul.XMM, 0>, VEX, VVVV, WIG; } defm VPMULHRSW : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16, VR128, load, i128mem, - SchedWriteVecIMul.XMM, 0>, VEX_4V, WIG; + SchedWriteVecIMul.XMM, 0>, VEX, VVVV, WIG; } let ImmT = NoImm, Predicates = [HasAVX] in { let isCommutable = 0 in { defm VPHADDW : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128, load, i128mem, - SchedWritePHAdd.XMM, 0>, VEX_4V, WIG; + SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG; defm VPHADDD : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128, load, i128mem, - SchedWritePHAdd.XMM, 0>, VEX_4V, WIG; + SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG; defm VPHSUBW : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128, load, i128mem, - SchedWritePHAdd.XMM, 0>, VEX_4V, WIG; + SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG; defm VPHSUBD : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128, load, i128mem, - SchedWritePHAdd.XMM, 0>, VEX_4V, WIG; + SchedWritePHAdd.XMM, 0>, VEX, VVVV, WIG; defm VPSIGNB : SS3I_binop_rm_int<0x08, "vpsignb", int_x86_ssse3_psign_b_128, - SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG; + SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG; defm VPSIGNW : SS3I_binop_rm_int<0x09, "vpsignw", int_x86_ssse3_psign_w_128, - SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG; + SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG; defm VPSIGND : SS3I_binop_rm_int<0x0A, "vpsignd", int_x86_ssse3_psign_d_128, - SchedWriteVecALU.XMM, load, 0>, VEX_4V, WIG; + SchedWriteVecALU.XMM, load, 0>, VEX, VVVV, WIG; defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw", int_x86_ssse3_phadd_sw_128, - SchedWritePHAdd.XMM, load, 0>, VEX_4V, WIG; + SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG; defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw", int_x86_ssse3_phsub_sw_128, - SchedWritePHAdd.XMM, load, 0>, VEX_4V, WIG; + SchedWritePHAdd.XMM, load, 0>, VEX, VVVV, WIG; } } @@ -4852,42 +4852,42 @@ let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { let isCommutable = 0 in { defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8, VR256, load, i256mem, - SchedWriteVarShuffle.YMM, 0>, VEX_4V, VEX_L, WIG; + SchedWriteVarShuffle.YMM, 0>, VEX, VVVV, VEX_L, WIG; defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16, v32i8, VR256, load, i256mem, - SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, WIG; + SchedWriteVecIMul.YMM, 0>, VEX, VVVV, VEX_L, WIG; } defm VPMULHRSWY : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, 
v16i16, VR256, load, i256mem, - SchedWriteVecIMul.YMM, 0>, VEX_4V, VEX_L, WIG; + SchedWriteVecIMul.YMM, 0>, VEX, VVVV, VEX_L, WIG; } let ImmT = NoImm, Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16, VR256, load, i256mem, - SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG; + SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG; defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256, load, i256mem, - SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG; + SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG; defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16, VR256, load, i256mem, - SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG; + SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG; defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256, load, i256mem, - SchedWritePHAdd.YMM, 0>, VEX_4V, VEX_L, WIG; + SchedWritePHAdd.YMM, 0>, VEX, VVVV, VEX_L, WIG; defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b, - SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG; + SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG; defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w, - SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG; + SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG; defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d, - SchedWriteVecALU.YMM>, VEX_4V, VEX_L, WIG; + SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L, WIG; defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", int_x86_avx2_phadd_sw, - SchedWritePHAdd.YMM>, VEX_4V, VEX_L, WIG; + SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG; defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", int_x86_avx2_phsub_sw, - SchedWritePHAdd.YMM>, VEX_4V, VEX_L, WIG; + SchedWritePHAdd.YMM>, VEX, VVVV, VEX_L, WIG; } } @@ -4956,10 +4956,10 @@ multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC, let Predicates = [HasAVX, NoVLX_Or_NoBWI] in defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, load, i128mem, - SchedWriteShuffle.XMM, 0>, VEX_4V, WIG; + SchedWriteShuffle.XMM, 0>, VEX, VVVV, WIG; let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, load, i256mem, - SchedWriteShuffle.YMM, 0>, VEX_4V, VEX_L, WIG; + SchedWriteShuffle.YMM, 0>, VEX, VVVV, VEX_L, WIG; let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memop, i128mem, SchedWriteShuffle.XMM>; @@ -5367,7 +5367,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { } let Predicates = [HasAVX, NoBWI] in { - defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V, WIG; + defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX, VVVV, WIG; def : Pat<(X86pinsrb VR128:$src1, (i32 (anyext (i8 GR8:$src2))), timm:$src3), (VPINSRBrr VR128:$src1, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit), timm:$src3)>; @@ -5398,7 +5398,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { } let Predicates = [HasAVX, NoDQI] in - defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; + defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX, VVVV; let Constraints = "$src1 = $dst" in defm PINSRD : SS41I_insert32<0x22, "pinsrd">; @@ -5424,7 +5424,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { } let Predicates = [HasAVX, NoDQI] in - defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, REX_W; + defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX, VVVV, REX_W; let Constraints = "$src1 = $dst" 
in defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; @@ -5459,7 +5459,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { let ExeDomain = SSEPackedSingle in { let Predicates = [UseAVX] in defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; let Constraints = "$src1 = $dst" in defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>; } @@ -5638,9 +5638,9 @@ let Predicates = [HasAVX, NoVLX] in { let Predicates = [UseAVX] in { defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, v4f32, v2f64, X86RndScales, 0>, - VEX_4V, VEX_LIG, WIG, SIMD_EXC; + VEX, VVVV, VEX_LIG, WIG, SIMD_EXC; defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, - VEX_4V, VEX_LIG, WIG, SIMD_EXC; + VEX, VVVV, VEX_LIG, WIG, SIMD_EXC; } let Predicates = [UseAVX] in { @@ -5842,65 +5842,65 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX, NoVLX] in { defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128, load, i128mem, SchedWriteVecALU.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128, load, i128mem, SchedWriteVecALU.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128, load, i128mem, SchedWriteVecALU.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128, load, i128mem, SchedWriteVecALU.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPMULDQ : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v2i64, VR128, load, i128mem, SchedWriteVecIMul.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128, load, i128mem, SchedWriteVecALU.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128, load, i128mem, SchedWriteVecALU.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128, load, i128mem, SchedWriteVecALU.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128, load, i128mem, SchedWriteVecALU.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; } let Predicates = [HasAVX2, NoVLX] in { defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256, load, i256mem, SchedWriteVecALU.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256, load, i256mem, SchedWriteVecALU.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256, load, i256mem, SchedWriteVecALU.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256, load, i256mem, SchedWriteVecALU.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPMULDQY : SS48I_binop_rm<0x28, "vpmuldq", X86pmuldq, v4i64, VR256, load, i256mem, SchedWriteVecIMul.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256, load, i256mem, SchedWriteVecALU.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256, load, i256mem, SchedWriteVecALU.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256, 
load, i256mem, SchedWriteVecALU.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256, load, i256mem, SchedWriteVecALU.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; } let Constraints = "$src1 = $dst" in { @@ -5927,20 +5927,20 @@ let Constraints = "$src1 = $dst" in { let Predicates = [HasAVX, NoVLX] in defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128, load, i128mem, SchedWritePMULLD.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; let Predicates = [HasAVX] in defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128, load, i128mem, SchedWriteVecALU.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; let Predicates = [HasAVX2, NoVLX] in defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256, load, i256mem, SchedWritePMULLD.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; let Predicates = [HasAVX2] in defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256, load, i256mem, SchedWriteVecALU.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; let Constraints = "$src1 = $dst" in { defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128, @@ -6088,22 +6088,22 @@ let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, VR128, load, i128mem, 0, - SchedWriteMPSAD.XMM>, VEX_4V, WIG; + SchedWriteMPSAD.XMM>, VEX, VVVV, WIG; } let Uses = [MXCSR], mayRaiseFPException = 1 in { let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, VR128, load, f128mem, 0, - SchedWriteDPPS.XMM>, VEX_4V, WIG; + SchedWriteDPPS.XMM>, VEX, VVVV, WIG; let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, VR128, load, f128mem, 0, - SchedWriteDPPD.XMM>, VEX_4V, WIG; + SchedWriteDPPD.XMM>, VEX, VVVV, WIG; let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, VR256, load, i256mem, 0, - SchedWriteDPPS.YMM>, VEX_4V, VEX_L, WIG; + SchedWriteDPPS.YMM>, VEX, VVVV, VEX_L, WIG; } } @@ -6111,7 +6111,7 @@ let Predicates = [HasAVX2] in { let isCommutable = 0 in { defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw, VR256, load, i256mem, 0, - SchedWriteMPSAD.YMM>, VEX_4V, VEX_L, WIG; + SchedWriteMPSAD.YMM>, VEX, VVVV, VEX_L, WIG; } } @@ -6170,30 +6170,30 @@ let Predicates = [HasAVX] in { defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32, VR128, load, f128mem, 0, SSEPackedSingle, SchedWriteFBlend.XMM, BlendCommuteImm4>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32, VR256, load, f256mem, 0, SSEPackedSingle, SchedWriteFBlend.YMM, BlendCommuteImm8>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64, VR128, load, f128mem, 0, SSEPackedDouble, SchedWriteFBlend.XMM, BlendCommuteImm2>, - VEX_4V, WIG; + VEX, VVVV, WIG; defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64, VR256, load, f256mem, 0, SSEPackedDouble, SchedWriteFBlend.YMM, BlendCommuteImm4>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16, VR128, load, i128mem, 0, SSEPackedInt, SchedWriteBlend.XMM, BlendCommuteImm8>, - VEX_4V, WIG; + VEX, VVVV, WIG; } let Predicates = [HasAVX2] in { defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16, VR256, load, i256mem, 0, SSEPackedInt, 
SchedWriteBlend.YMM, BlendCommuteImm8>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; } // Emulate vXi32/vXi64 blends with vXf32/vXf64 or pblendw. @@ -6290,7 +6290,7 @@ multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))], - SSEPackedInt>, TAPD, VEX_4V, + SSEPackedInt>, TAPD, VEX, VVVV, Sched<[sched]>; def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst), @@ -6299,7 +6299,7 @@ multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (OpNode RC:$src3, (mem_frag addr:$src2), - RC:$src1))], SSEPackedInt>, TAPD, VEX_4V, + RC:$src1))], SSEPackedInt>, TAPD, VEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold, // x86memop:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -6564,12 +6564,12 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasAVX] in defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128, load, i128mem, SchedWriteVecALU.XMM, 0>, - VEX_4V, WIG; + VEX, VVVV, WIG; let Predicates = [HasAVX2] in defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256, load, i256mem, SchedWriteVecALU.YMM, 0>, - VEX_4V, VEX_L, WIG; + VEX, VVVV, VEX_L, WIG; let Constraints = "$src1 = $dst" in defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128, @@ -6832,28 +6832,28 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, // Perform One Round of an AES Encryption/Decryption Flow let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in { defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", - int_x86_aesni_aesenc, load>, VEX_4V, WIG; + int_x86_aesni_aesenc, load>, VEX, VVVV, WIG; defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", - int_x86_aesni_aesenclast, load>, VEX_4V, WIG; + int_x86_aesni_aesenclast, load>, VEX, VVVV, WIG; defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", - int_x86_aesni_aesdec, load>, VEX_4V, WIG; + int_x86_aesni_aesdec, load>, VEX, VVVV, WIG; defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", - int_x86_aesni_aesdeclast, load>, VEX_4V, WIG; + int_x86_aesni_aesdeclast, load>, VEX, VVVV, WIG; } let Predicates = [NoVLX, HasVAES] in { defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", int_x86_aesni_aesenc_256, load, 0, VR256, - i256mem>, VEX_4V, VEX_L, WIG; + i256mem>, VEX, VVVV, VEX_L, WIG; defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast", int_x86_aesni_aesenclast_256, load, 0, VR256, - i256mem>, VEX_4V, VEX_L, WIG; + i256mem>, VEX, VVVV, VEX_L, WIG; defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", int_x86_aesni_aesdec_256, load, 0, VR256, - i256mem>, VEX_4V, VEX_L, WIG; + i256mem>, VEX, VVVV, VEX_L, WIG; defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast", int_x86_aesni_aesdeclast_256, load, 0, VR256, - i256mem>, VEX_4V, VEX_L, WIG; + i256mem>, VEX, VVVV, VEX_L, WIG; } let Constraints = "$src1 = $dst" in { @@ -6994,11 +6994,11 @@ multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, load, - int_x86_pclmulqdq>, VEX_4V, WIG; + int_x86_pclmulqdq>, VEX, VVVV, WIG; let Predicates = [NoVLX, HasVPCLMULQDQ] in defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load, - int_x86_pclmulqdq_256>, VEX_4V, VEX_L, WIG; + int_x86_pclmulqdq_256>, VEX, VVVV, VEX_L, WIG; multiclass 
vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, X86MemOperand MemOp, string Hi, string Lo> { @@ -7169,11 +7169,11 @@ let isCommutable = 1 in def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, - VEX_4V, VEX_L, Sched<[WriteFShuffle256]>; + VEX, VVVV, VEX_L, Sched<[WriteFShuffle256]>; def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, - VEX_4V, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>; + VEX, VVVV, VEX_L, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>; } // Immediate transform to help with commuting. @@ -7212,12 +7212,12 @@ let hasSideEffects = 0, ExeDomain = SSEPackedSingle in { def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, u8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, Sched<[WriteFShuffle256]>, VEX_4V, VEX_L; + []>, Sched<[WriteFShuffle256]>, VEX, VVVV, VEX_L; let mayLoad = 1 in def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f128mem:$src2, u8imm:$src3), "vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; + []>, Sched<[WriteFShuffle256.Folded, WriteFShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L; } // To create a 256-bit all ones value, we should produce VCMPTRUEPS @@ -7315,22 +7315,22 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, (ins VR128:$src1, f128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, - VEX_4V, Sched<[schedX.RM]>; + VEX, VVVV, Sched<[schedX.RM]>; def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, - VEX_4V, VEX_L, Sched<[schedY.RM]>; + VEX, VVVV, VEX_L, Sched<[schedY.RM]>; def mr : AVX8I<opc_mr, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, - VEX_4V, Sched<[schedX.MR]>; + VEX, VVVV, Sched<[schedX.MR]>; def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, - VEX_4V, VEX_L, Sched<[schedY.MR]>; + VEX, VVVV, VEX_L, Sched<[schedY.MR]>; } let ExeDomain = SSEPackedSingle in @@ -7361,14 +7361,14 @@ multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, - VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; + VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>; def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i128mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>, - VEX_4V, Sched<[SchedWriteVecIMul.XMM.Folded, + VEX, VVVV, Sched<[SchedWriteVecIMul.XMM.Folded, SchedWriteVecIMul.XMM.ReadAfterFold, 
SchedWriteVecIMul.XMM.ReadAfterFold]>; @@ -7378,14 +7378,14 @@ multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>, - VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; + VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>, - VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded, + VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM.Folded, SchedWriteVecIMul.YMM.ReadAfterFold, SchedWriteVecIMul.YMM.ReadAfterFold]>; } @@ -7424,13 +7424,13 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V, + [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX, VVVV, Sched<[varsched]>; def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop_i:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, - (i_vt (load addr:$src2)))))]>, VEX_4V, + (i_vt (load addr:$src2)))))]>, VEX, VVVV, Sched<[varsched.Folded, sched.ReadAfterFold]>; def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), @@ -7558,14 +7558,14 @@ multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))]>, - Sched<[sched]>, VEX_4V; + Sched<[sched]>, VEX, VVVV; def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src1, (load addr:$src2), timm:$src3)))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>, VEX_4V; + Sched<[sched.Folded, sched.ReadAfterFold]>, VEX, VVVV; // Pattern to commute if load is in first source. 
def : Pat<(OpVT (OpNode (load addr:$src2), RC:$src1, timm:$src3)), @@ -7815,7 +7815,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, - Sched<[Sched]>, VEX_4V, VEX_L; + Sched<[Sched]>, VEX, VVVV, VEX_L; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, memOp:$src2), !strconcat(OpcodeStr, @@ -7823,7 +7823,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (OpVT (X86VPermv VR256:$src1, (load addr:$src2))))]>, - Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX_4V, VEX_L; + Sched<[Sched.Folded, Sched.ReadAfterFold]>, VEX, VVVV, VEX_L; } } @@ -7866,11 +7866,11 @@ let isCommutable = 1 in def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, u8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, - Sched<[WriteShuffle256]>, VEX_4V, VEX_L; + Sched<[WriteShuffle256]>, VEX, VVVV, VEX_L; def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, u8imm:$src3), "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", []>, - Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; + Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L; let Predicates = [HasAVX2] in { defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>; @@ -7888,12 +7888,12 @@ let hasSideEffects = 0 in { def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR128:$src2, u8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L; + []>, Sched<[WriteShuffle256]>, VEX, VVVV, VEX_L; let mayLoad = 1 in def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i128mem:$src2, u8imm:$src3), "vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX_4V, VEX_L; + []>, Sched<[WriteShuffle256.Folded, WriteShuffle256.ReadAfterFold]>, VEX, VVVV, VEX_L; } let Predicates = [HasAVX2, NoVLX] in { @@ -7939,22 +7939,22 @@ multiclass avx2_pmovmask<string OpcodeStr, (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, - VEX_4V, Sched<[schedX.RM]>; + VEX, VVVV, Sched<[schedX.RM]>; def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, - VEX_4V, VEX_L, Sched<[schedY.RM]>; + VEX, VVVV, VEX_L, Sched<[schedY.RM]>; def mr : AVX28I<0x8e, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, - VEX_4V, Sched<[schedX.MR]>; + VEX, VVVV, Sched<[schedX.MR]>; def Ymr : AVX28I<0x8e, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, - VEX_4V, VEX_L, Sched<[schedY.MR]>; + VEX, VVVV, VEX_L, Sched<[schedY.MR]>; } defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", @@ -8012,28 +8012,28 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, 
$src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>, - VEX_4V, Sched<[SchedWriteVarVecShift.XMM]>; + VEX, VVVV, Sched<[SchedWriteVarVecShift.XMM]>; def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (vt128 (load addr:$src2)))))]>, - VEX_4V, Sched<[SchedWriteVarVecShift.XMM.Folded, + VEX, VVVV, Sched<[SchedWriteVarVecShift.XMM.Folded, SchedWriteVarVecShift.XMM.ReadAfterFold]>; def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, - VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>; + VEX, VVVV, VEX_L, Sched<[SchedWriteVarVecShift.YMM]>; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (vt256 (load addr:$src2)))))]>, - VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, + VEX, VVVV, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded, SchedWriteVarVecShift.YMM.ReadAfterFold]>; } @@ -8146,10 +8146,10 @@ multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> { let Predicates = [HasGFNI, HasAVX, NoVLX] in { defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128, load, i128mem, SchedWriteVecIMul.XMM>, - VEX_4V, REX_W; + VEX, VVVV, REX_W; defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256, load, i256mem, SchedWriteVecIMul.YMM>, - VEX_4V, VEX_L, REX_W; + VEX, VVVV, VEX_L, REX_W; } } @@ -8160,9 +8160,9 @@ defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memop, i128mem, SchedWriteVecALU.XMM, 1>; let Predicates = [HasGFNI, HasAVX, NoVLX] in { defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, load, - i128mem, SchedWriteVecALU.XMM>, VEX_4V; + i128mem, SchedWriteVecALU.XMM>, VEX, VVVV; defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, load, - i256mem, SchedWriteVecALU.YMM>, VEX_4V, VEX_L; + i256mem, SchedWriteVecALU.YMM>, VEX, VVVV, VEX_L; } // GF2P8AFFINEINVQB, GF2P8AFFINEQB let isCommutable = 0 in { @@ -8183,28 +8183,28 @@ multiclass avx_ifma_rm<bits<8> opc, string OpcodeStr, SDNode OpNode> { !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR128:$dst, (v2i64 (OpNode VR128:$src2, VR128:$src3, VR128:$src1)))]>, - VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; + VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>; } def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i128mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR128:$dst, (v2i64 (OpNode VR128:$src2, (loadv2i64 addr:$src3), VR128:$src1)))]>, - VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; + VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>; let isCommutable = 1 in { def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR256:$dst, (v4i64 (OpNode VR256:$src2, VR256:$src3, VR256:$src1)))]>, - VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; + VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; } def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR256:$dst, (v4i64 (OpNode VR256:$src2, (loadv4i64 
addr:$src3), VR256:$src1)))]>, - VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; + VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; } defm VPMADD52HUQ : avx_ifma_rm<0xb5, "vpmadd52huq", x86vpmadd52h>, REX_W, ExplicitVEXPrefix; @@ -8222,13 +8222,13 @@ multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT, (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, - VEX_4V, Sched<[Sched]>; + VEX, VVVV, Sched<[Sched]>; def rm : I<Opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, X86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, (MemOpFrag addr:$src3))))]>, - VEX_4V, Sched<[Sched.Folded, Sched.ReadAfterFold]>; + VEX, VVVV, Sched<[Sched.Folded, Sched.ReadAfterFold]>; } let Predicates = [HasAVXVNNIINT8] in { @@ -8349,7 +8349,7 @@ def VSHA512RNDS2rr : I<0xcb, MRMSrcReg, (outs VR256:$dst), "vsha512rnds2\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR256:$dst, (int_x86_vsha512rnds2 VR256:$src1, VR256:$src2, VR128:$src3))]>, - VEX_L, VEX_4V, T8XD, Sched<[WriteVecIMul]>; + VEX_L, VEX, VVVV, T8XD, Sched<[WriteVecIMul]>; } // FIXME: Is there a better scheduler class for SM3 than WriteVecIMul? @@ -8361,14 +8361,14 @@ let Predicates = [HasSM3], Constraints = "$src1 = $dst" in { [(set VR128:$dst, (!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1, VR128:$src2, VR128:$src3))]>, - Sched<[WriteVecIMul]>, VEX_4V; + Sched<[WriteVecIMul]>, VEX, VVVV; def rm : I<0xda, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i128mem:$src3), !strconcat(OpStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set VR128:$dst, (!cast<Intrinsic>("int_x86_"#OpStr) VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3)))]>, - Sched<[WriteVecIMul]>, VEX_4V; + Sched<[WriteVecIMul]>, VEX, VVVV; } multiclass VSM3RNDS2_Base { @@ -8391,7 +8391,7 @@ let Predicates = [HasSM3], Constraints = "$src1 = $dst" in { defm VSM3MSG1 : SM3_Base<"vsm3msg1">, T8PS; defm VSM3MSG2 : SM3_Base<"vsm3msg2">, T8PD; -defm VSM3RNDS2 : VSM3RNDS2_Base, VEX_4V, TAPD; +defm VSM3RNDS2 : VSM3RNDS2_Base, VEX, VVVV, TAPD; // FIXME: Is there a better scheduler class for SM4 than WriteVecIMul? 
let Predicates = [HasSM4] in { @@ -8412,10 +8412,10 @@ let Predicates = [HasSM4] in { } } -defm VSM4KEY4 : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8XS, VEX_4V; -defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8XS, VEX_L, VEX_4V; -defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8XD, VEX_4V; -defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8XD, VEX_L, VEX_4V; +defm VSM4KEY4 : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8XS, VEX, VVVV; +defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8XS, VEX_L, VEX, VVVV; +defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8XD, VEX, VVVV; +defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8XD, VEX_L, VEX, VVVV; let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> { @@ -8426,7 +8426,7 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> { [(set VR128:$dst, (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128") VR128:$src1, VR128:$src2, VR128:$src3)))]>, - VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; + VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>; def rm : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i128mem:$src3), @@ -8434,7 +8434,7 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> { [(set VR128:$dst, (v4i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_128") VR128:$src1, VR128:$src2, (loadv4i32 addr:$src3))))]>, - VEX_4V, Sched<[SchedWriteVecIMul.XMM]>; + VEX, VVVV, Sched<[SchedWriteVecIMul.XMM]>; let isCommutable = IsCommutable in def Yrr : I<opc, MRMSrcReg, (outs VR256:$dst), @@ -8443,7 +8443,7 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> { [(set VR256:$dst, (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256") VR256:$src1, VR256:$src2, VR256:$src3)))]>, - VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; + VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; def Yrm : I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, i256mem:$src3), @@ -8451,7 +8451,7 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> { [(set VR256:$dst, (v8i32 (!cast<Intrinsic>("int_x86_avx2_"#OpcodeStr#"_256") VR256:$src1, VR256:$src2, (loadv8i32 addr:$src3))))]>, - VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; + VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; } defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8XS; diff --git a/llvm/lib/Target/X86/X86InstrTBM.td b/llvm/lib/Target/X86/X86InstrTBM.td index ed514038a12e4..09200f0c1a9f6 100644 --- a/llvm/lib/Target/X86/X86InstrTBM.td +++ b/llvm/lib/Target/X86/X86InstrTBM.td @@ -46,11 +46,11 @@ multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem, let hasSideEffects = 0 in { def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src), !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>, - XOP_4V, XOP9, Sched<[Sched]>; + XOP, VVVV, XOP9, Sched<[Sched]>; let mayLoad = 1 in def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), []>, - XOP_4V, XOP9, Sched<[Sched.Folded]>; + XOP, VVVV, XOP9, Sched<[Sched.Folded]>; } } diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index a94efd2b1a050..dd59a641dfaa2 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -66,11 +66,10 
@@ class VEX { Encoding OpEnc = EncVEX; } class WIG { bit IgnoresW = 1; } // Special version of REX_W that can be changed to VEX.W==0 for EVEX2VEX. class VEX_W1X { bit hasREX_W = 1; bit EVEX_W1_VEX_W0 = 1; } -class VEX_4V : VEX { bit hasVEX_4V = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } +class VVVV { bit hasVEX_4V = 1; } class EVEX { Encoding OpEnc = EncEVEX; } -class EVEX_4V : EVEX { bit hasVEX_4V = 1; } class EVEX_K { bit hasEVEX_K = 1; } class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; } class EVEX_B { bit hasEVEX_B = 1; } @@ -88,7 +87,6 @@ class EVEX_CD8<int esize, CD8VForm form> { } class NoCD8 { bits<7> CD8_Scale = 0; } class XOP { Encoding OpEnc = EncXOP; } -class XOP_4V : XOP { bit hasVEX_4V = 1; } class EVEX2VEXOverride<string VEXInstrName> { string EVEX2VEXOverride = VEXInstrName; } @@ -860,7 +858,7 @@ class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm, class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> : I<o, F, outs, ins, asm, pattern>, T8PD, - EVEX_4V, Requires<[HasAVX512]>; + EVEX, VVVV, Requires<[HasAVX512]>; class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> @@ -889,29 +887,29 @@ class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> : I<o, F, outs, ins, asm, pattern>, T8PD, - VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>; + VEX, VVVV, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>; class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> : I<o, F, outs, ins, asm, pattern>, T8PD, - VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>; + VEX, VVVV, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>; class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> : I<o, F, outs, ins, asm, pattern>, T8PD, - VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>; + VEX, VVVV, FMASC, Requires<[HasFMA, NoAVX512]>; // FMA4 Instruction Templates class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD, - VEX_4V, FMASC, Requires<[HasFMA4, NoVLX]>; + VEX, VVVV, FMASC, Requires<[HasFMA4, NoVLX]>; class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD, - VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>; + VEX, VVVV, FMASC, Requires<[HasFMA4, NoAVX512]>; class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD, - VEX_4V, FMASC, Requires<[HasFMA4]>; + VEX, VVVV, FMASC, Requires<[HasFMA4]>; // XOP 2, 3 and 4 Operand Instruction Template class IXOP<bits<8> o, Format F, dag outs, dag ins, string asm, @@ -934,7 +932,7 @@ class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm, class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, - VEX_4V, Requires<[HasXOP]>; + VEX, VVVV, Requires<[HasXOP]>; // X86-64 Instruction templates... 
// diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td index a62bb2e855c9f..1504d77bfb86e 100644 --- a/llvm/lib/Target/X86/X86InstrXOP.td +++ b/llvm/lib/Target/X86/X86InstrXOP.td @@ -105,7 +105,7 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)))))]>, - XOP_4V, REX_W, Sched<[sched.Folded, sched.ReadAfterFold]>; + XOP, VVVV, REX_W, Sched<[sched.Folded, sched.ReadAfterFold]>; def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst), (ins i128mem:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -119,7 +119,7 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - XOP_4V, REX_W, Sched<[sched]>; + XOP, VVVV, REX_W, Sched<[sched]>; } let ExeDomain = SSEPackedInt in { @@ -173,7 +173,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V, + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP, VVVV, Sched<[sched]>; def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, VR128:$src3), @@ -181,7 +181,7 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int VR128:$src1, (load addr:$src2), - VR128:$src3))]>, XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + VR128:$src3))]>, XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } let ExeDomain = SSEPackedInt in { @@ -252,7 +252,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128, [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), timm:$cc)))]>, - XOP_4V, Sched<[sched]>; + XOP, VVVV, Sched<[sched]>; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$cc), !strconcat("vpcom", Suffix, @@ -261,7 +261,7 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128, (vt128 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)), timm:$cc)))]>, - XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } def : Pat<(OpNode (load addr:$src2), @@ -288,7 +288,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), (vt128 VR128:$src3))))]>, - XOP_4V, Sched<[sched]>; + XOP, VVVV, Sched<[sched]>; def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i128mem:$src3), !strconcat(OpcodeStr, @@ -296,7 +296,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), (vt128 (load addr:$src3)))))]>, - XOP_4V, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; + XOP, VVVV, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -304,7 +304,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set VR128:$dst, (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (load addr:$src2)), (vt128 VR128:$src3))))]>, - XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold, + 
XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold, // 128mem:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -316,7 +316,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, REX_W, Sched<[sched]>; + []>, XOP, VVVV, REX_W, Sched<[sched]>; } let ExeDomain = SSEPackedInt in { @@ -333,7 +333,7 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1), - (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V, + (X86andnp RC:$src3, RC:$src2))))]>, XOP, VVVV, Sched<[sched]>; // FIXME: We can't write a pattern for this in tablegen. let hasSideEffects = 0, mayLoad = 1 in @@ -342,14 +342,14 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, - XOP_4V, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; + XOP, VVVV, REX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1), (X86andnp RC:$src3, (load addr:$src2)))))]>, - XOP_4V, Sched<[sched.Folded, sched.ReadAfterFold, + XOP, VVVV, Sched<[sched.Folded, sched.ReadAfterFold, // x86memop:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -361,7 +361,7 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC, (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, REX_W, Sched<[sched]>; + []>, XOP, VVVV, REX_W, Sched<[sched]>; } let ExeDomain = SSEPackedInt in { From 847a6f8f0a6b9b98f2a219c7af59306e9b03b796 Mon Sep 17 00:00:00 2001 From: Ryan Holt <ryanholt@mathworks.com> Date: Thu, 21 Dec 2023 18:49:15 -0800 Subject: [PATCH 131/342] [mlir][MemRef] Add runtime bounds checking (#75817) This change adds (runtime) bounds checks for `memref` ops using the existing `RuntimeVerifiableOpInterface`. For `memref.load` and `memref.store`, we check that the indices are in-bounds of the memref's index space. For `memref.reinterpret_cast` and `memref.subview` we check that the resulting address space is in-bounds of the input memref's address space. 
--- .../Transforms/RuntimeOpVerification.cpp | 172 +++++++++++++++++- .../Memref/cast-runtime-verification.mlir | 8 +- .../Memref/load-runtime-verification.mlir | 67 +++++++ ...reinterpret-cast-runtime-verification.mlir | 74 ++++++++ .../Memref/subview-runtime-verification.mlir | 89 +++++++++ 5 files changed, 405 insertions(+), 5 deletions(-) create mode 100644 mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir create mode 100644 mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir create mode 100644 mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir diff --git a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp index 05a069d98ef35..05b813a3b1e90 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/RuntimeOpVerification.cpp @@ -8,10 +8,14 @@ #include "mlir/Dialect/MemRef/Transforms/RuntimeOpVerification.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/MemRef/Utils/MemRefUtils.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Interfaces/RuntimeVerifiableOpInterface.h" using namespace mlir; @@ -21,6 +25,12 @@ static std::string generateErrorMessage(Operation *op, const std::string &msg) { std::string buffer; llvm::raw_string_ostream stream(buffer); OpPrintingFlags flags; + // We may generate a lot of error messages and so we need to ensure the + // printing is fast. + flags.elideLargeElementsAttrs(); + flags.printGenericOpForm(); + flags.skipRegions(); + flags.useLocalScope(); stream << "ERROR: Runtime op verification failed\n"; op->print(stream, flags); stream << "\n^ " << msg; @@ -133,6 +143,161 @@ struct CastOpInterface } }; +/// Verifies that the indices on load/store ops are in-bounds of the memref's +/// index space: 0 <= index#i < dim#i +template <typename LoadStoreOp> +struct LoadStoreOpInterface + : public RuntimeVerifiableOpInterface::ExternalModel< + LoadStoreOpInterface<LoadStoreOp>, LoadStoreOp> { + void generateRuntimeVerification(Operation *op, OpBuilder &builder, + Location loc) const { + auto loadStoreOp = cast<LoadStoreOp>(op); + + auto memref = loadStoreOp.getMemref(); + auto rank = memref.getType().getRank(); + if (rank == 0) { + return; + } + auto indices = loadStoreOp.getIndices(); + + auto zero = builder.create<arith::ConstantIndexOp>(loc, 0); + Value assertCond; + for (auto i : llvm::seq<int64_t>(0, rank)) { + auto index = indices[i]; + + auto dimOp = builder.createOrFold<memref::DimOp>(loc, memref, i); + + auto geLow = builder.createOrFold<arith::CmpIOp>( + loc, arith::CmpIPredicate::sge, index, zero); + auto ltHigh = builder.createOrFold<arith::CmpIOp>( + loc, arith::CmpIPredicate::slt, index, dimOp); + auto andOp = builder.createOrFold<arith::AndIOp>(loc, geLow, ltHigh); + + assertCond = + i > 0 ? builder.createOrFold<arith::AndIOp>(loc, assertCond, andOp) + : andOp; + } + builder.create<cf::AssertOp>( + loc, assertCond, generateErrorMessage(op, "out-of-bounds access")); + } +}; + +/// Compute the linear index for the provided strided layout and indices. 
+Value computeLinearIndex(OpBuilder &builder, Location loc, OpFoldResult offset, + ArrayRef<OpFoldResult> strides, + ArrayRef<OpFoldResult> indices) { + auto [expr, values] = computeLinearIndex(offset, strides, indices); + auto index = + affine::makeComposedFoldedAffineApply(builder, loc, expr, values); + return getValueOrCreateConstantIndexOp(builder, loc, index); +} + +/// Returns two Values representing the bounds of the provided strided layout +/// metadata. The bounds are returned as a half open interval -- [low, high). +std::pair<Value, Value> computeLinearBounds(OpBuilder &builder, Location loc, + OpFoldResult offset, + ArrayRef<OpFoldResult> strides, + ArrayRef<OpFoldResult> sizes) { + auto zeros = SmallVector<int64_t>(sizes.size(), 0); + auto indices = getAsIndexOpFoldResult(builder.getContext(), zeros); + auto lowerBound = computeLinearIndex(builder, loc, offset, strides, indices); + auto upperBound = computeLinearIndex(builder, loc, offset, strides, sizes); + return {lowerBound, upperBound}; +} + +/// Returns two Values representing the bounds of the memref. The bounds are +/// returned as a half open interval -- [low, high). +std::pair<Value, Value> computeLinearBounds(OpBuilder &builder, Location loc, + TypedValue<BaseMemRefType> memref) { + auto runtimeMetadata = builder.create<ExtractStridedMetadataOp>(loc, memref); + auto offset = runtimeMetadata.getConstifiedMixedOffset(); + auto strides = runtimeMetadata.getConstifiedMixedStrides(); + auto sizes = runtimeMetadata.getConstifiedMixedSizes(); + return computeLinearBounds(builder, loc, offset, strides, sizes); +} + +/// Verifies that the linear bounds of a reinterpret_cast op are within the +/// linear bounds of the base memref: low >= baseLow && high <= baseHigh +struct ReinterpretCastOpInterface + : public RuntimeVerifiableOpInterface::ExternalModel< + ReinterpretCastOpInterface, ReinterpretCastOp> { + void generateRuntimeVerification(Operation *op, OpBuilder &builder, + Location loc) const { + auto reinterpretCast = cast<ReinterpretCastOp>(op); + auto baseMemref = reinterpretCast.getSource(); + auto resultMemref = + cast<TypedValue<BaseMemRefType>>(reinterpretCast.getResult()); + + builder.setInsertionPointAfter(op); + + // Compute the linear bounds of the base memref + auto [baseLow, baseHigh] = computeLinearBounds(builder, loc, baseMemref); + + // Compute the linear bounds of the resulting memref + auto [low, high] = computeLinearBounds(builder, loc, resultMemref); + + // Check low >= baseLow + auto geLow = builder.createOrFold<arith::CmpIOp>( + loc, arith::CmpIPredicate::sge, low, baseLow); + + // Check high <= baseHigh + auto leHigh = builder.createOrFold<arith::CmpIOp>( + loc, arith::CmpIPredicate::sle, high, baseHigh); + + auto assertCond = builder.createOrFold<arith::AndIOp>(loc, geLow, leHigh); + + builder.create<cf::AssertOp>( + loc, assertCond, + generateErrorMessage( + op, + "result of reinterpret_cast is out-of-bounds of the base memref")); + } +}; + +/// Verifies that the linear bounds of a subview op are within the linear bounds +/// of the base memref: low >= baseLow && high <= baseHigh +/// TODO: This is not yet a full runtime verification of subview. For example, +/// consider: +/// %m = memref.alloc(%c10, %c10) : memref<10x10xf32> +/// memref.subview %m[%c0, %c0][%c20, %c2][%c1, %c1] +/// : memref<?x?xf32> to memref<?x?xf32> +/// The subview is in-bounds of the entire base memref but the first dimension +/// is out-of-bounds. Future work would verify the bounds on a per-dimension +/// basis. 
+struct SubViewOpInterface + : public RuntimeVerifiableOpInterface::ExternalModel<SubViewOpInterface, + SubViewOp> { + void generateRuntimeVerification(Operation *op, OpBuilder &builder, + Location loc) const { + auto subView = cast<SubViewOp>(op); + auto baseMemref = cast<TypedValue<BaseMemRefType>>(subView.getSource()); + auto resultMemref = cast<TypedValue<BaseMemRefType>>(subView.getResult()); + + builder.setInsertionPointAfter(op); + + // Compute the linear bounds of the base memref + auto [baseLow, baseHigh] = computeLinearBounds(builder, loc, baseMemref); + + // Compute the linear bounds of the resulting memref + auto [low, high] = computeLinearBounds(builder, loc, resultMemref); + + // Check low >= baseLow + auto geLow = builder.createOrFold<arith::CmpIOp>( + loc, arith::CmpIPredicate::sge, low, baseLow); + + // Check high <= baseHigh + auto leHigh = builder.createOrFold<arith::CmpIOp>( + loc, arith::CmpIPredicate::sle, high, baseHigh); + + auto assertCond = builder.createOrFold<arith::AndIOp>(loc, geLow, leHigh); + + builder.create<cf::AssertOp>( + loc, assertCond, + generateErrorMessage(op, + "subview is out-of-bounds of the base memref")); + } +}; + struct ExpandShapeOpInterface : public RuntimeVerifiableOpInterface::ExternalModel<ExpandShapeOpInterface, ExpandShapeOp> { @@ -183,8 +348,13 @@ void mlir::memref::registerRuntimeVerifiableOpInterfaceExternalModels( registry.addExtension(+[](MLIRContext *ctx, memref::MemRefDialect *dialect) { CastOp::attachInterface<CastOpInterface>(*ctx); ExpandShapeOp::attachInterface<ExpandShapeOpInterface>(*ctx); + LoadOp::attachInterface<LoadStoreOpInterface<LoadOp>>(*ctx); + ReinterpretCastOp::attachInterface<ReinterpretCastOpInterface>(*ctx); + StoreOp::attachInterface<LoadStoreOpInterface<StoreOp>>(*ctx); + SubViewOp::attachInterface<SubViewOpInterface>(*ctx); // Load additional dialects of which ops may get created. 
- ctx->loadDialect<arith::ArithDialect, cf::ControlFlowDialect>(); + ctx->loadDialect<affine::AffineDialect, arith::ArithDialect, + cf::ControlFlowDialect>(); }); } diff --git a/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir index 6ad817a73408c..52b8c16d753da 100644 --- a/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir +++ b/mlir/test/Integration/Dialect/Memref/cast-runtime-verification.mlir @@ -33,26 +33,26 @@ func.func @main() { %alloc = memref.alloc() : memref<5xf32> // CHECK: ERROR: Runtime op verification failed - // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32> to memref<10xf32> + // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32>) -> memref<10xf32> // CHECK-NEXT: ^ size mismatch of dim 0 // CHECK-NEXT: Location: loc({{.*}}) %1 = memref.cast %alloc : memref<5xf32> to memref<?xf32> func.call @cast_to_static_dim(%1) : (memref<?xf32>) -> (memref<10xf32>) // CHECK-NEXT: ERROR: Runtime op verification failed - // CHECK-NEXT: memref.cast %{{.*}} : memref<*xf32> to memref<f32> + // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<*xf32>) -> memref<f32> // CHECK-NEXT: ^ rank mismatch // CHECK-NEXT: Location: loc({{.*}}) %3 = memref.cast %alloc : memref<5xf32> to memref<*xf32> func.call @cast_to_ranked(%3) : (memref<*xf32>) -> (memref<f32>) // CHECK-NEXT: ERROR: Runtime op verification failed - // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32, strided<[?], offset: ?>> to memref<?xf32, strided<[9], offset: 5>> + // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32, strided<[?], offset: ?>>) -> memref<?xf32, strided<[9], offset: 5>> // CHECK-NEXT: ^ offset mismatch // CHECK-NEXT: Location: loc({{.*}}) // CHECK-NEXT: ERROR: Runtime op verification failed - // CHECK-NEXT: memref.cast %{{.*}} : memref<?xf32, strided<[?], offset: ?>> to memref<?xf32, strided<[9], offset: 5>> + // CHECK-NEXT: "memref.cast"(%{{.*}}) : (memref<?xf32, strided<[?], offset: ?>>) -> memref<?xf32, strided<[9], offset: 5>> // CHECK-NEXT: ^ stride mismatch of dim 0 // CHECK-NEXT: Location: loc({{.*}}) %4 = memref.cast %alloc diff --git a/mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir new file mode 100644 index 0000000000000..169dfd7056459 --- /dev/null +++ b/mlir/test/Integration/Dialect/Memref/load-runtime-verification.mlir @@ -0,0 +1,67 @@ +// RUN: mlir-opt %s -generate-runtime-verification \ +// RUN: -expand-strided-metadata \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -test-cf-assert \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils 2>&1 | \ +// RUN: FileCheck %s + +func.func @load(%memref: memref<1xf32>, %index: index) { + memref.load %memref[%index] : memref<1xf32> + return +} + +func.func @load_dynamic(%memref: memref<?xf32>, %index: index) { + memref.load %memref[%index] : memref<?xf32> + return +} + +func.func @load_nd_dynamic(%memref: memref<?x?x?xf32>, %index0: index, %index1: index, %index2: index) { + memref.load %memref[%index0, %index1, %index2] : memref<?x?x?xf32> + return +} + +func.func @main() { + %0 = arith.constant 0 : index + %1 = arith.constant 1 : index + %n1 = arith.constant -1 : index + %2 = arith.constant 2 : index + %alloca_1 = memref.alloca() : memref<1xf32> + %alloc_1 = memref.alloc(%1) : memref<?xf32> + %alloc_2x2x2 = memref.alloc(%2, %2, %2) : memref<?x?x?xf32> + + 
// CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<1xf32>, index) -> f32 + // CHECK-NEXT: ^ out-of-bounds access + // CHECK-NEXT: Location: loc({{.*}}) + func.call @load(%alloca_1, %1) : (memref<1xf32>, index) -> () + + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<?xf32>, index) -> f32 + // CHECK-NEXT: ^ out-of-bounds access + // CHECK-NEXT: Location: loc({{.*}}) + func.call @load_dynamic(%alloc_1, %1) : (memref<?xf32>, index) -> () + + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.load"(%{{.*}}, %{{.*}}) : (memref<?x?x?xf32>, index, index, index) -> f32 + // CHECK-NEXT: ^ out-of-bounds access + // CHECK-NEXT: Location: loc({{.*}}) + func.call @load_nd_dynamic(%alloc_2x2x2, %1, %n1, %0) : (memref<?x?x?xf32>, index, index, index) -> () + + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @load(%alloca_1, %0) : (memref<1xf32>, index) -> () + + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @load_dynamic(%alloc_1, %0) : (memref<?xf32>, index) -> () + + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @load_nd_dynamic(%alloc_2x2x2, %1, %1, %0) : (memref<?x?x?xf32>, index, index, index) -> () + + memref.dealloc %alloc_1 : memref<?xf32> + memref.dealloc %alloc_2x2x2 : memref<?x?x?xf32> + + return +} + diff --git a/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir new file mode 100644 index 0000000000000..3700291540547 --- /dev/null +++ b/mlir/test/Integration/Dialect/Memref/reinterpret-cast-runtime-verification.mlir @@ -0,0 +1,74 @@ +// RUN: mlir-opt %s -generate-runtime-verification \ +// RUN: -lower-affine \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -test-cf-assert \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils 2>&1 | \ +// RUN: FileCheck %s + +func.func @reinterpret_cast(%memref: memref<1xf32>, %offset: index) { + memref.reinterpret_cast %memref to + offset: [%offset], + sizes: [1], + strides: [1] + : memref<1xf32> to memref<1xf32, strided<[1], offset: ?>> + return +} + +func.func @reinterpret_cast_fully_dynamic(%memref: memref<?xf32>, %offset: index, %size: index, %stride: index) { + memref.reinterpret_cast %memref to + offset: [%offset], + sizes: [%size], + strides: [%stride] + : memref<?xf32> to memref<?xf32, strided<[?], offset: ?>> + return +} + +func.func @main() { + %0 = arith.constant 0 : index + %1 = arith.constant 1 : index + %n1 = arith.constant -1 : index + %4 = arith.constant 4 : index + %5 = arith.constant 5 : index + + %alloca_1 = memref.alloca() : memref<1xf32> + %alloc_4 = memref.alloc(%4) : memref<?xf32> + + // Offset is out-of-bounds + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}}) + // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref + // CHECK-NEXT: Location: loc({{.*}}) + func.call @reinterpret_cast(%alloca_1, %1) : (memref<1xf32>, index) -> () + + // Offset is out-of-bounds + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}}) + // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref + // CHECK-NEXT: Location: loc({{.*}}) + func.call @reinterpret_cast(%alloca_1, %n1) : (memref<1xf32>, index) 
-> () + + // Size is out-of-bounds + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}}) + // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref + // CHECK-NEXT: Location: loc({{.*}}) + func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %5, %1) : (memref<?xf32>, index, index, index) -> () + + // Stride is out-of-bounds + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.reinterpret_cast"(%{{.*}}) + // CHECK-NEXT: ^ result of reinterpret_cast is out-of-bounds of the base memref + // CHECK-NEXT: Location: loc({{.*}}) + func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %4, %4) : (memref<?xf32>, index, index, index) -> () + + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @reinterpret_cast(%alloca_1, %0) : (memref<1xf32>, index) -> () + + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @reinterpret_cast_fully_dynamic(%alloc_4, %0, %4, %1) : (memref<?xf32>, index, index, index) -> () + + return +} diff --git a/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir b/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir new file mode 100644 index 0000000000000..48987ce216f1a --- /dev/null +++ b/mlir/test/Integration/Dialect/Memref/subview-runtime-verification.mlir @@ -0,0 +1,89 @@ +// RUN: mlir-opt %s -generate-runtime-verification \ +// RUN: -expand-strided-metadata \ +// RUN: -lower-affine \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -test-cf-assert \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts | \ +// RUN: mlir-cpu-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils 2>&1 | \ +// RUN: FileCheck %s + +func.func @subview(%memref: memref<1xf32>, %offset: index) { + memref.subview %memref[%offset] [1] [1] : + memref<1xf32> to + memref<1xf32, strided<[1], offset: ?>> + return +} + +func.func @subview_dynamic(%memref: memref<?x4xf32>, %offset: index, %size: index, %stride: index) { + memref.subview %memref[%offset, 0] [%size, 4] [%stride, 1] : + memref<?x4xf32> to + memref<?x4xf32, strided<[?, 1], offset: ?>> + return +} + +func.func @subview_dynamic_rank_reduce(%memref: memref<?x4xf32>, %offset: index, %size: index, %stride: index) { + memref.subview %memref[%offset, 0] [%size, 1] [%stride, 1] : + memref<?x4xf32> to + memref<?xf32, strided<[?], offset: ?>> + return +} + +func.func @main() { + %0 = arith.constant 0 : index + %1 = arith.constant 1 : index + %n1 = arith.constant -1 : index + %4 = arith.constant 4 : index + %5 = arith.constant 5 : index + + %alloca = memref.alloca() : memref<1xf32> + %alloc = memref.alloc(%4) : memref<?x4xf32> + + // Offset is out-of-bounds + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.subview" + // CHECK-NEXT: ^ subview is out-of-bounds of the base memref + // CHECK-NEXT: Location: loc({{.*}}) + func.call @subview_dynamic_rank_reduce(%alloc, %5, %5, %1) : (memref<?x4xf32>, index, index, index) -> () + + // Offset is out-of-bounds + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.subview" + // CHECK-NEXT: ^ subview is out-of-bounds of the base memref + // CHECK-NEXT: Location: loc({{.*}}) + func.call @subview(%alloca, %1) : (memref<1xf32>, index) -> () + + // Offset is out-of-bounds + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.subview" + // CHECK-NEXT: ^ subview is out-of-bounds of the base memref + // CHECK-NEXT: Location: loc({{.*}}) + func.call 
@subview(%alloca, %n1) : (memref<1xf32>, index) -> () + + // Size is out-of-bounds + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.subview" + // CHECK-NEXT: ^ subview is out-of-bounds of the base memref + // CHECK-NEXT: Location: loc({{.*}}) + func.call @subview_dynamic(%alloc, %0, %5, %1) : (memref<?x4xf32>, index, index, index) -> () + + // Stride is out-of-bounds + // CHECK: ERROR: Runtime op verification failed + // CHECK-NEXT: "memref.subview" + // CHECK-NEXT: ^ subview is out-of-bounds of the base memref + // CHECK-NEXT: Location: loc({{.*}}) + func.call @subview_dynamic(%alloc, %0, %4, %4) : (memref<?x4xf32>, index, index, index) -> () + + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @subview(%alloca, %0) : (memref<1xf32>, index) -> () + + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @subview_dynamic(%alloc, %0, %4, %1) : (memref<?x4xf32>, index, index, index) -> () + + // CHECK-NOT: ERROR: Runtime op verification failed + func.call @subview_dynamic_rank_reduce(%alloc, %0, %1, %0) : (memref<?x4xf32>, index, index, index) -> () + + + return +} From 26ddf4eee2a009147faa3000c55d7822c2087dce Mon Sep 17 00:00:00 2001 From: Fangrui Song <i@maskray.me> Date: Thu, 21 Dec 2023 18:59:11 -0800 Subject: [PATCH 132/342] [ELF] Change .debug_names tombstone value to UINT32_MAX/UINT64_MAX (#74686) `clang -g -gpubnames -fdebug-types-section` now emits .debug_names section with references to local type unit entries defined in COMDAT .debug_info sections. ``` .section .debug_info,"G",@progbits,5657452045627120676,comdat .Ltu_begin0: ... .section .debug_names,"",@progbits ... // DWARF32 .long .Ltu_begin0 # Type unit 0 // DWARF64 // .long .Ltu_begin0 # Type unit 0 ``` When `.Ltu_begin0` is relative to a non-prevailing .debug_info section, the relocation resolves to 0, which is a valid offset within the .debug_info section. ``` cat > a.cc <<e struct A { int x; }; inline A foo() { return {1}; } int main() { foo(); } e cat > b.cc <<e struct A { int x; }; inline A foo() { return {1}; } void use() { foo(); } e clang++ -g -gpubnames -fdebug-types-section -fuse-ld=lld a.cc b.cc -o old ``` ``` % llvm-dwarfdump old ... Local Type Unit offsets [ LocalTU[0]: 0x00000000 ] ... Local Type Unit offsets [ LocalTU[0]: 0x00000000 // indistinguishable from a valid offset within .debug_info ] ``` https://dwarfstd.org/issues/231013.1.html proposes that we use a tombstone value instead to inform consumers. This patch implements the idea. The second LocalTU entry will now use 0xffffffff. https://reviews.llvm.org/D84825 has a TODO that we should switch the tombstone value for most `.debug_*` sections to UINT64_MAX. We have postponed the change for more than three years for consumers to migrate. At some point we shall make the change, so that .debug_names is no longer different from other debug sections that are not .debug_loc/.debug_ranges. 
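For a consumer, the practical effect is that a dead local TU reference now reads back as an all-ones value instead of a plausible offset 0. A hypothetical reader-side check might look like the following sketch; the helper and its surroundings are illustrative only, and only the UINT32_MAX/UINT64_MAX tombstone values come from this patch.

```
#include <cstdint>
#include <optional>

// Illustrative only: skip .debug_names local type unit entries whose offset
// was resolved against a discarded (non-prevailing) .debug_info section.
std::optional<uint64_t> getLiveLocalTUOffset(uint64_t rawOffset,
                                             bool isDWARF64) {
  const uint64_t tombstone = isDWARF64 ? UINT64_MAX : UINT32_MAX;
  if (rawOffset == tombstone)
    return std::nullopt; // Tombstoned entry: the referenced TU was dropped.
  return rawOffset;
}
```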
Co-authored-by: Alexander Yermolovich <ayermolo@meta.com> --- lld/ELF/InputSection.cpp | 22 ++++++++++++++++------ lld/test/ELF/debug-dead-reloc-32.s | 11 +++++++++++ lld/test/ELF/debug-dead-reloc.s | 24 ++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 81468a20dfb54..5dfb57fda432e 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -898,10 +898,16 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) { const TargetInfo &target = *elf::target; const auto emachine = config->emachine; const bool isDebug = isDebugSection(*this); - const bool isDebugLocOrRanges = - isDebug && (name == ".debug_loc" || name == ".debug_ranges"); const bool isDebugLine = isDebug && name == ".debug_line"; std::optional<uint64_t> tombstone; + if (isDebug) { + if (name == ".debug_loc" || name == ".debug_ranges") + tombstone = 1; + else if (name == ".debug_names") + tombstone = UINT64_MAX; // tombstone value + else + tombstone = 0; + } for (const auto &patAndValue : llvm::reverse(config->deadRelocInNonAlloc)) if (patAndValue.first.match(this->name)) { tombstone = patAndValue.second; @@ -946,8 +952,7 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) { return; } - if (tombstone || - (isDebug && (type == target.symbolicRel || expr == R_DTPREL))) { + if (tombstone && (expr == R_ABS || expr == R_DTPREL)) { // Resolve relocations in .debug_* referencing (discarded symbols or ICF // folded section symbols) to a tombstone value. Resolving to addend is // unsatisfactory because the result address range may collide with a @@ -978,8 +983,13 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef<RelTy> rels) { // value. Enable -1 in a future release. if (!sym.getOutputSection() || (ds && ds->folded && !isDebugLine)) { // If -z dead-reloc-in-nonalloc= is specified, respect it. - const uint64_t value = tombstone ? SignExtend64<bits>(*tombstone) - : (isDebugLocOrRanges ? 1 : 0); + uint64_t value = SignExtend64<bits>(*tombstone); + // For a 32-bit local TU reference in .debug_names, X86_64::relocate + // requires that the unsigned value for R_X86_64_32 is truncated to + // 32-bit. Other 64-bit targets's don't discern signed/unsigned 32-bit + // absolute relocations and do not need this change. + if (emachine == EM_X86_64 && type == R_X86_64_32) + value = static_cast<uint32_t>(value); target.relocateNoSym(bufLoc, type, value); continue; } diff --git a/lld/test/ELF/debug-dead-reloc-32.s b/lld/test/ELF/debug-dead-reloc-32.s index 99335b44f51ce..1aa43148689e9 100644 --- a/lld/test/ELF/debug-dead-reloc-32.s +++ b/lld/test/ELF/debug-dead-reloc-32.s @@ -13,6 +13,8 @@ # CHECK-NEXT: 0000 01000000 # CHECK-NEXT: Contents of section .debug_addr: # CHECK-NEXT: 0000 00000000 +# CHECK-NEXT: Contents of section .debug_names: +# CHECK-NEXT: 0000 ffffffff ## -z dead-reloc-in-nonalloc= can override the tombstone value. # RUN: ld.lld -z dead-reloc-in-nonalloc=.debug_loc=42 -z dead-reloc-in-nonalloc=.debug_addr=0xfffffffffffffffe %t.o -o %t1 @@ -38,3 +40,12 @@ ## Resolved to UINT32_C(0), with the addend ignored. .section .debug_addr .long .text.1+8 + +.section .debug_info,"eG",@progbits,5657452045627120676,comdat +.Ltu_begin0: + +.section .debug_names +## .debug_names may reference a local type unit defined in a COMDAT .debug_info +## section (-g -gpubnames -fdebug-types-section). If the referenced section is +## non-prevailing, resolve to UINT32_MAX. 
+.long .Ltu_begin0 diff --git a/lld/test/ELF/debug-dead-reloc.s b/lld/test/ELF/debug-dead-reloc.s index cfa41e00eab06..1a8823737ed56 100644 --- a/lld/test/ELF/debug-dead-reloc.s +++ b/lld/test/ELF/debug-dead-reloc.s @@ -21,9 +21,12 @@ # CHECK: Contents of section .debug_addr: # CHECK-NEXT: 0000 {{.*}}00 00000000 {{.*}}00 00000000 # CHECK-NEXT: 0010 00000000 00000000 {{.*}}00 00000000 +# CHECK: Contents of section .debug_names: +# CHECK-NEXT: 0000 00000000 00000000 00000000 ffffffff . +# CHECK-NEXT: 0010 ffffffff ffffffff . # CHECK: Contents of section .debug_foo: -# CHECK-NEXT: 0000 00000000 00000000 08000000 00000000 -# CHECK-NEXT: 0010 00000000 00000000 08000000 00000000 +# CHECK-NEXT: 0000 00000000 00000000 00000000 00000000 +# CHECK-NEXT: 0010 00000000 00000000 00000000 00000000 # REL: Relocations [ # REL-NEXT: .rela.text { @@ -43,6 +46,12 @@ # REL-NEXT: 0x10 R_X86_64_NONE - 0x18 # REL-NEXT: 0x18 R_X86_64_64 group 0x20 # REL-NEXT: } +# REL-NEXT: .rela.debug_names { +# REL-NEXT: 0x0 R_X86_64_32 .debug_info 0x0 +# REL-NEXT: 0x4 R_X86_64_64 .debug_info 0x0 +# REL-NEXT: 0xC R_X86_64_NONE - 0x0 +# REL-NEXT: 0x10 R_X86_64_NONE - 0x0 +# REL-NEXT: } # REL-NEXT: .rela.debug_foo { # REL-NEXT: 0x0 R_X86_64_NONE - 0x8 # REL-NEXT: 0x8 R_X86_64_NONE - 0x8 @@ -82,6 +91,17 @@ group: ## resolved to the prevailing copy. .quad group+32 +.section .debug_info,"G",@progbits,5657452045627120676,comdat +.Ltu_begin0: + +.section .debug_names +## .debug_names may reference a local type unit defined in a COMDAT .debug_info +## section (-g -gpubnames -fdebug-types-section). If the referenced section is +## non-prevailing, resolve to UINT32_MAX. +.long .Ltu_begin0 +## ... or UINT64_MAX for DWARF64. +.quad .Ltu_begin0 + .section .debug_foo .quad .text.1+8 From a03c53c53082ae14d5af44a0cf7fa4626ab3f6f3 Mon Sep 17 00:00:00 2001 From: Jakub Kuderski <jakub@nod-labs.com> Date: Thu, 21 Dec 2023 23:06:26 -0500 Subject: [PATCH 133/342] [mlir][spirv] Add physical storage buffer extension test. NFC. (#76196) This test demonstrates how the PhysicalStorageBuffer extension can be used end-2-end in a spir-v module. This module has been verified to pass serialization, deserialization, and validation with spirv-val. --- .../Target/SPIRV/physical-storage-buffer.mlir | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 mlir/test/Target/SPIRV/physical-storage-buffer.mlir diff --git a/mlir/test/Target/SPIRV/physical-storage-buffer.mlir b/mlir/test/Target/SPIRV/physical-storage-buffer.mlir new file mode 100644 index 0000000000000..040cfb891cb31 --- /dev/null +++ b/mlir/test/Target/SPIRV/physical-storage-buffer.mlir @@ -0,0 +1,48 @@ +// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s + +// Test file showing how the Physical Storage Buffer extension works end-2-end. 
+ +!f32_binding = !spirv.struct<binding_f32_t, (!spirv.rtarray<f32, stride=4> [0])> +!f32_binding_ptr = !spirv.ptr<!f32_binding, PhysicalStorageBuffer> + +!set_0 = !spirv.struct<set_0_t, (!f32_binding_ptr [0], + !f32_binding_ptr [8], + !f32_binding_ptr [16])> +!set_0_ptr = !spirv.ptr<!set_0, StorageBuffer> + +!set_1 = !spirv.struct<set_1_t, (!f32_binding_ptr [0], + !f32_binding_ptr [8])> +!set_1_ptr = !spirv.ptr<!set_1, StorageBuffer> + +spirv.module PhysicalStorageBuffer64 GLSL450 requires #spirv.vce<v1.5, + [Shader, Int64, PhysicalStorageBufferAddresses], [SPV_KHR_physical_storage_buffer]> { + + spirv.GlobalVariable @set_0 bind(3, 0) : !set_0_ptr + spirv.GlobalVariable @set_1 bind(3, 1) : !set_1_ptr + + // CHECK-LABEL: spirv.func @main() "None" + spirv.func @main() "None" { + %idx0 = spirv.Constant 0 : i64 + %idx1 = spirv.Constant 1 : i64 + %idx2 = spirv.Constant 2 : i64 + %set_0_addr = spirv.mlir.addressof @set_0 : !set_0_ptr + %s0_b2_ptr = spirv.AccessChain %set_0_addr[%idx2] : !set_0_ptr, i64 + %b2_ptr = spirv.Load "StorageBuffer" %s0_b2_ptr : !f32_binding_ptr + %b2_data_ptr = spirv.AccessChain %b2_ptr[%idx0, %idx0] : !f32_binding_ptr, i64, i64 + + // CHECK: spirv.Load "PhysicalStorageBuffer" + %b2_data = spirv.Load "PhysicalStorageBuffer" %b2_data_ptr ["Aligned", 4] : f32 + + %set_1_addr = spirv.mlir.addressof @set_1 : !set_1_ptr + %s1_b1_ptr = spirv.AccessChain %set_1_addr[%idx1] : !set_1_ptr, i64 + %b1_ptr = spirv.Load "StorageBuffer" %s1_b1_ptr : !f32_binding_ptr + %b1_data_ptr = spirv.AccessChain %b1_ptr[%idx0, %idx0] : !f32_binding_ptr, i64, i64 + + // CHECK: spirv.Store "PhysicalStorageBuffer" + spirv.Store "PhysicalStorageBuffer" %b1_data_ptr, %b2_data ["Aligned", 4] : f32 + + spirv.Return + } + + spirv.EntryPoint "GLCompute" @main, @set_0, @set_1 +} From 248fba0cd806a0f6bf4b0f12979f2185f2bed111 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Thu, 21 Dec 2023 11:12:18 +0700 Subject: [PATCH 134/342] AMDGPU: Remove pointless setOperationAction for xint_to_fp The legalize action for uint_to_fp/sint_to_fp uses the source integer type, not the result FP type so setting an action on an FP type does nothing. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4f4bc45e49b43..fc119aa61d01a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -540,10 +540,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, MVT::f16, Custom); setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); - - setOperationAction( - {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP}, - MVT::f16, Promote); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote); // F16 - VOP2 Actions. setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand); From 0ccc1e7acdda4488a5ae680a4cead6cdd238efab Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Thu, 21 Dec 2023 20:55:30 -0800 Subject: [PATCH 135/342] Revert "[AArch64] Fold more load.x into load.i with large offset" Issue #76202 This reverts commit f5687636415969e6d945659a0b78734abdfb0f06. 
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 13 --- .../AArch64/AArch64LoadStoreOptimizer.cpp | 53 +----------- llvm/test/CodeGen/AArch64/arm64-addrmode.ll | 85 +++++++++++-------- 3 files changed, 54 insertions(+), 97 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 6d85e1fb5fbf1..bc9678c13971f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -4094,20 +4094,7 @@ AArch64InstrInfo::getLdStAmountOp(const MachineInstr &MI) { switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode"); - case AArch64::LDRBroX: case AArch64::LDRBBroX: - case AArch64::LDRSBXroX: - case AArch64::LDRSBWroX: - case AArch64::LDRHroX: - case AArch64::LDRHHroX: - case AArch64::LDRSHXroX: - case AArch64::LDRSHWroX: - case AArch64::LDRWroX: - case AArch64::LDRSroX: - case AArch64::LDRSWroX: - case AArch64::LDRDroX: - case AArch64::LDRXroX: - case AArch64::LDRQroX: return MI.getOperand(4); } } diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index aa7a4bc235361..b435b3ce03e7e 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -180,7 +180,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Scan the instruction list to find a register assigned with a const // value that can be combined with the current instruction (a load or store) - // using base addressing with writeback. Scan backwards. + // using base addressing with writeback. Scan forwards. MachineBasicBlock::iterator findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset); @@ -221,7 +221,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find and merge a base register updates before or after a ld/st instruction. bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI); - // Find and merge a index ldr/st instruction into a base ld/st instruction. + // Find and merge a index ldr/st instructions into a base ld/st instruction. bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale); bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt); @@ -511,34 +511,8 @@ static unsigned getBaseAddressOpcode(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Opcode has no base address equivalent!"); - case AArch64::LDRBroX: - return AArch64::LDRBui; case AArch64::LDRBBroX: return AArch64::LDRBBui; - case AArch64::LDRSBXroX: - return AArch64::LDRSBXui; - case AArch64::LDRSBWroX: - return AArch64::LDRSBWui; - case AArch64::LDRHroX: - return AArch64::LDRHui; - case AArch64::LDRHHroX: - return AArch64::LDRHHui; - case AArch64::LDRSHXroX: - return AArch64::LDRSHXui; - case AArch64::LDRSHWroX: - return AArch64::LDRSHWui; - case AArch64::LDRWroX: - return AArch64::LDRWui; - case AArch64::LDRSroX: - return AArch64::LDRSui; - case AArch64::LDRSWroX: - return AArch64::LDRSWui; - case AArch64::LDRDroX: - return AArch64::LDRDui; - case AArch64::LDRXroX: - return AArch64::LDRXui; - case AArch64::LDRQroX: - return AArch64::LDRQui; } } @@ -790,31 +764,10 @@ static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) { default: return false; // Scaled instructions. - // TODO: Add more index address stores. - case AArch64::LDRBroX: + // TODO: Add more index address loads/stores. 
case AArch64::LDRBBroX: - case AArch64::LDRSBXroX: - case AArch64::LDRSBWroX: Scale = 1; return true; - case AArch64::LDRHroX: - case AArch64::LDRHHroX: - case AArch64::LDRSHXroX: - case AArch64::LDRSHWroX: - Scale = 2; - return true; - case AArch64::LDRWroX: - case AArch64::LDRSroX: - case AArch64::LDRSWroX: - Scale = 4; - return true; - case AArch64::LDRDroX: - case AArch64::LDRXroX: - Scale = 8; - return true; - case AArch64::LDRQroX: - Scale = 16; - return true; } } diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll index bfef61abd8c12..2181eaaee7db6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll +++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll @@ -239,8 +239,9 @@ define i32 @LdOffset_i8_zext32(ptr %a) { define i32 @LdOffset_i8_sext32(ptr %a) { ; CHECK-LABEL: LdOffset_i8_sext32: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrsb w0, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrsb w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -265,8 +266,9 @@ define i64 @LdOffset_i8_zext64(ptr %a) { define i64 @LdOffset_i8_sext64(ptr %a) { ; CHECK-LABEL: LdOffset_i8_sext64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrsb x0, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrsb x0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 %val = load i8, ptr %arrayidx, align 1 @@ -278,8 +280,9 @@ define i64 @LdOffset_i8_sext64(ptr %a) { define i16 @LdOffset_i16(ptr %a) { ; CHECK-LABEL: LdOffset_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 -; CHECK-NEXT: ldrh w0, [x8, #7408] +; CHECK-NEXT: mov w8, #48368 // =0xbcf0 +; CHECK-NEXT: movk w8, #31, lsl #16 +; CHECK-NEXT: ldrh w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 %val = load i16, ptr %arrayidx, align 2 @@ -290,8 +293,9 @@ define i16 @LdOffset_i16(ptr %a) { define i32 @LdOffset_i16_zext32(ptr %a) { ; CHECK-LABEL: LdOffset_i16_zext32: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 -; CHECK-NEXT: ldrh w0, [x8, #7408] +; CHECK-NEXT: mov w8, #48368 // =0xbcf0 +; CHECK-NEXT: movk w8, #31, lsl #16 +; CHECK-NEXT: ldrh w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 %val = load i16, ptr %arrayidx, align 2 @@ -303,8 +307,9 @@ define i32 @LdOffset_i16_zext32(ptr %a) { define i32 @LdOffset_i16_sext32(ptr %a) { ; CHECK-LABEL: LdOffset_i16_sext32: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 -; CHECK-NEXT: ldrsh w0, [x8, #7408] +; CHECK-NEXT: mov w8, #48368 // =0xbcf0 +; CHECK-NEXT: movk w8, #31, lsl #16 +; CHECK-NEXT: ldrsh w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 %val = load i16, ptr %arrayidx, align 2 @@ -316,8 +321,9 @@ define i32 @LdOffset_i16_sext32(ptr %a) { define i64 @LdOffset_i16_zext64(ptr %a) { ; CHECK-LABEL: LdOffset_i16_zext64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 -; CHECK-NEXT: ldrh w0, [x8, #7408] +; CHECK-NEXT: mov w8, #48368 // =0xbcf0 +; CHECK-NEXT: movk w8, #31, lsl #16 +; CHECK-NEXT: ldrh w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 %val = load i16, ptr %arrayidx, align 
2 @@ -329,8 +335,9 @@ define i64 @LdOffset_i16_zext64(ptr %a) { define i64 @LdOffset_i16_sext64(ptr %a) { ; CHECK-LABEL: LdOffset_i16_sext64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 -; CHECK-NEXT: ldrsh x0, [x8, #7408] +; CHECK-NEXT: mov w8, #48368 // =0xbcf0 +; CHECK-NEXT: movk w8, #31, lsl #16 +; CHECK-NEXT: ldrsh x0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 %val = load i16, ptr %arrayidx, align 2 @@ -342,8 +349,9 @@ define i64 @LdOffset_i16_sext64(ptr %a) { define i32 @LdOffset_i32(ptr %a) { ; CHECK-LABEL: LdOffset_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152 -; CHECK-NEXT: ldr w0, [x8, #14816] +; CHECK-NEXT: mov w8, #31200 // =0x79e0 +; CHECK-NEXT: movk w8, #63, lsl #16 +; CHECK-NEXT: ldr w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992 %val = load i32, ptr %arrayidx, align 4 @@ -354,8 +362,9 @@ define i32 @LdOffset_i32(ptr %a) { define i64 @LdOffset_i32_zext64(ptr %a) { ; CHECK-LABEL: LdOffset_i32_zext64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152 -; CHECK-NEXT: ldr w0, [x8, #14816] +; CHECK-NEXT: mov w8, #31200 // =0x79e0 +; CHECK-NEXT: movk w8, #63, lsl #16 +; CHECK-NEXT: ldr w0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992 %val = load i32, ptr %arrayidx, align 2 @@ -367,8 +376,9 @@ define i64 @LdOffset_i32_zext64(ptr %a) { define i64 @LdOffset_i32_sext64(ptr %a) { ; CHECK-LABEL: LdOffset_i32_sext64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152 -; CHECK-NEXT: ldrsw x0, [x8, #14816] +; CHECK-NEXT: mov w8, #31200 // =0x79e0 +; CHECK-NEXT: movk w8, #63, lsl #16 +; CHECK-NEXT: ldrsw x0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992 %val = load i32, ptr %arrayidx, align 2 @@ -380,8 +390,9 @@ define i64 @LdOffset_i32_sext64(ptr %a) { define i64 @LdOffset_i64(ptr %a) { ; CHECK-LABEL: LdOffset_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2024, lsl #12 // =8290304 -; CHECK-NEXT: ldr x0, [x8, #29632] +; CHECK-NEXT: mov w8, #62400 // =0xf3c0 +; CHECK-NEXT: movk w8, #126, lsl #16 +; CHECK-NEXT: ldr x0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992 %val = load i64, ptr %arrayidx, align 4 @@ -392,8 +403,9 @@ define i64 @LdOffset_i64(ptr %a) { define <2 x i32> @LdOffset_v2i32(ptr %a) { ; CHECK-LABEL: LdOffset_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2024, lsl #12 // =8290304 -; CHECK-NEXT: ldr d0, [x8, #29632] +; CHECK-NEXT: mov w8, #62400 // =0xf3c0 +; CHECK-NEXT: movk w8, #126, lsl #16 +; CHECK-NEXT: ldr d0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds <2 x i32>, ptr %a, i64 1039992 %val = load <2 x i32>, ptr %arrayidx, align 4 @@ -404,8 +416,9 @@ define <2 x i32> @LdOffset_v2i32(ptr %a) { define <2 x i64> @LdOffset_v2i64(ptr %a) { ; CHECK-LABEL: LdOffset_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #4048, lsl #12 // =16580608 -; CHECK-NEXT: ldr q0, [x8, #59264] +; CHECK-NEXT: mov w8, #59264 // =0xe780 +; CHECK-NEXT: movk w8, #253, lsl #16 +; CHECK-NEXT: ldr q0, [x0, x8] ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds <2 x i64>, ptr %a, i64 1039992 %val = load <2 x i64>, ptr %arrayidx, align 4 @@ -416,8 +429,9 @@ define <2 x i64> @LdOffset_v2i64(ptr %a) { define double @LdOffset_i8_f64(ptr %a) { ; CHECK-LABEL: LdOffset_i8_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #253, lsl #12 // =1036288 -; CHECK-NEXT: ldrsb 
w8, [x8, #3704] +; CHECK-NEXT: mov w8, #56952 // =0xde78 +; CHECK-NEXT: movk w8, #15, lsl #16 +; CHECK-NEXT: ldrsb w8, [x0, x8] ; CHECK-NEXT: scvtf d0, w8 ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i8, ptr %a, i64 1039992 @@ -430,8 +444,9 @@ define double @LdOffset_i8_f64(ptr %a) { define double @LdOffset_i16_f64(ptr %a) { ; CHECK-LABEL: LdOffset_i16_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #506, lsl #12 // =2072576 -; CHECK-NEXT: ldrsh w8, [x8, #7408] +; CHECK-NEXT: mov w8, #48368 // =0xbcf0 +; CHECK-NEXT: movk w8, #31, lsl #16 +; CHECK-NEXT: ldrsh w8, [x0, x8] ; CHECK-NEXT: scvtf d0, w8 ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i16, ptr %a, i64 1039992 @@ -444,8 +459,9 @@ define double @LdOffset_i16_f64(ptr %a) { define double @LdOffset_i32_f64(ptr %a) { ; CHECK-LABEL: LdOffset_i32_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #1012, lsl #12 // =4145152 -; CHECK-NEXT: ldr s0, [x8, #14816] +; CHECK-NEXT: mov w8, #31200 // =0x79e0 +; CHECK-NEXT: movk w8, #63, lsl #16 +; CHECK-NEXT: ldr s0, [x0, x8] ; CHECK-NEXT: ucvtf d0, d0 ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i32, ptr %a, i64 1039992 @@ -458,8 +474,9 @@ define double @LdOffset_i32_f64(ptr %a) { define double @LdOffset_i64_f64(ptr %a) { ; CHECK-LABEL: LdOffset_i64_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: add x8, x0, #2024, lsl #12 // =8290304 -; CHECK-NEXT: ldr d0, [x8, #29632] +; CHECK-NEXT: mov w8, #62400 // =0xf3c0 +; CHECK-NEXT: movk w8, #126, lsl #16 +; CHECK-NEXT: ldr d0, [x0, x8] ; CHECK-NEXT: scvtf d0, d0 ; CHECK-NEXT: ret %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992 From beffa1e1f689f1d57079e9721df23ea89f1b730c Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Thu, 21 Dec 2023 21:14:21 -0800 Subject: [PATCH 136/342] [test][hwasan] Try to fix Android bot https://lab.llvm.org/buildbot/#/builders/77/builds/33152/steps/21/logs/stdio --- compiler-rt/test/hwasan/TestCases/strip_path_prefix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c index 5844749a6d977..1c89b47af155b 100644 --- a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c +++ b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c @@ -1,4 +1,4 @@ -// RUN: %clang_hwasan -O0 %s -o %t && %env_hwasan_opts=strip_path_prefix='"%S/"' not %run %t 2>&1 | FileCheck %s +// RUN: %clang_hwasan -O0 %s -o %t && %env_hwasan_opts=strip_path_prefix=/TestCases/ not %run %t 2>&1 | FileCheck %s // Stack histories currently are not recorded on x86. 
// XFAIL: target=x86_64{{.*}} From 50ed3b1eccd8497e3546f1f3cd6bd7631c6dc20e Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Wed, 20 Dec 2023 21:15:22 +0700 Subject: [PATCH 137/342] AMDGPU: Workaround a divergent return value bug in test --- llvm/test/CodeGen/AMDGPU/bf16.ll | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 79b9f8caea945..a63a46c010a90 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -9716,7 +9716,8 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) { %op = call bfloat @llvm.fabs.bf16(bfloat %a) %cast = bitcast bfloat %op to i16 %zext = zext i16 %cast to i32 - ret i32 %zext + %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) + ret i32 %readlane } define bfloat @v_fneg_bf16(bfloat %a) { From c7952d886060bf2fed7d3a4c08d5636db642a3c1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Thu, 21 Dec 2023 16:16:54 +0700 Subject: [PATCH 138/342] AMDGPU: Add a few more bfloat codegen tests --- llvm/test/CodeGen/AMDGPU/bf16.ll | 21493 +++++++++++----- llvm/test/CodeGen/AMDGPU/function-args.ll | 131 +- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 2395 +- 3 files changed, 17359 insertions(+), 6660 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index a63a46c010a90..85a24a063aa4e 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -72,432 +72,441 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ret void } -define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_load_store_f32_to_bf16: +define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) { +; GCN-LABEL: v_load_global_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_load_store_f32_to_bf16: +; GFX7-LABEL: v_load_global_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_load_store_f32_to_bf16: +; GFX8-LABEL: v_load_global_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: flat_store_short v[2:3], v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] 
; -; GFX9-LABEL: test_load_store_f32_to_bf16: +; GFX9-LABEL: v_load_global_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_load_store_f32_to_bf16: +; GFX10-LABEL: v_load_global_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_load_store_f32_to_bf16: +; GFX11-LABEL: v_load_global_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %val = load float, ptr addrspace(1) %in - %val.bf16 = fptrunc float %val to bfloat - store bfloat %val.bf16, ptr addrspace(1) %out - ret void + %load = load <2 x bfloat>, ptr addrspace(1) %ptr + ret <2 x bfloat> %load } -define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_load_store_f64_to_bf16: +define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) { +; GCN-LABEL: v_load_global_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_load_store_f64_to_bf16: +; GFX7-LABEL: v_load_global_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_load_store_f64_to_bf16: +; GFX8-LABEL: v_load_global_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: flat_store_short v[2:3], v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_f64_to_bf16: +; GFX9-LABEL: v_load_global_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_load_store_f64_to_bf16: +; GFX10-LABEL: v_load_global_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_load_store_f64_to_bf16: +; GFX11-LABEL: v_load_global_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %val = load double, ptr addrspace(1) %in - %val.bf16 = fptrunc double %val to bfloat - store bfloat %val.bf16, ptr addrspace(1) %out - ret void + %load = load <3 x bfloat>, ptr addrspace(1) %ptr + ret <3 x bfloat> %load } -define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_load_store_bf16_to_f32: +define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) { +; GCN-LABEL: v_load_global_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_load_store_bf16_to_f32: +; GFX7-LABEL: v_load_global_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_load_store_bf16_to_f32: +; GFX8-LABEL: 
v_load_global_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_bf16_to_f32: +; GFX9-LABEL: v_load_global_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: global_load_short_d16_hi v4, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[2:3], v4, off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_load_store_bf16_to_f32: +; GFX10-LABEL: v_load_global_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[2:3], v4, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_load_store_bf16_to_f32: +; GFX11-LABEL: v_load_global_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v[2:3], v4, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %val = load bfloat, ptr addrspace(1) %in - %val.f32 = fpext bfloat %val to float - store float %val.f32, ptr addrspace(1) %out - ret void + %load = load <4 x bfloat>, ptr addrspace(1) %ptr + ret <4 x bfloat> %load } -define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_load_store_bf16_to_f64: +define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) { +; GCN-LABEL: v_load_global_v6bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_load_store_bf16_to_f64: +; GFX7-LABEL: v_load_global_v6bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt 
vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_load_store_bf16_to_f64: +; GFX8-LABEL: v_load_global_v6bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_ushort v0, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_bf16_to_f64: +; GFX9-LABEL: v_load_global_v6bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: global_load_short_d16_hi v4, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_load_store_bf16_to_f64: +; GFX10-LABEL: v_load_global_v6bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off +; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_load_store_bf16_to_f64: +; GFX11-LABEL: v_load_global_v6bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off +; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 -; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %val = load bfloat, ptr addrspace(1) %in - %val.f64 = fpext bfloat %val to double - store double %val.f64, ptr addrspace(1) %out - ret void + %load = load <6 x bfloat>, ptr addrspace(1) %ptr + ret <6 x bfloat> %load } -define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_load_store_v2bf16: +define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) { +; GCN-LABEL: v_load_global_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_load_store_v2bf16: +; GFX7-LABEL: 
v_load_global_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_load_store_v2bf16: +; GFX8-LABEL: v_load_global_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_v2bf16: +; GFX9-LABEL: v_load_global_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_load_store_v2bf16: +; GFX10-LABEL: v_load_global_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_load_store_v2bf16: +; GFX11-LABEL: v_load_global_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %val = load <2 x bfloat>, ptr addrspace(1) %in - store <2 x bfloat> %val, ptr addrspace(1) %out - ret void + %load = load <8 x bfloat>, ptr addrspace(1) %ptr + ret <8 x bfloat> %load } -define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_load_store_v4bf16: +define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) { +; GCN-LABEL: v_load_global_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GCN-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_load_store_v4bf16: +; GFX7-LABEL: v_load_global_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_load_store_v4bf16: +; GFX8-LABEL: v_load_global_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, v0 +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_v4bf16: +; GFX9-LABEL: v_load_global_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[8:9], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_load_store_v4bf16: +; GFX10-LABEL: v_load_global_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v9, v1 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[8:9], off +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[8:9], off 
offset:16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_load_store_v4bf16: +; GFX11-LABEL: v_load_global_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[0:3], v[4:5], off +; GFX11-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %val = load <4 x bfloat>, ptr addrspace(1) %in - store <4 x bfloat> %val, ptr addrspace(1) %out - ret void + %load = load <16 x bfloat>, ptr addrspace(1) %ptr + ret <16 x bfloat> %load } -define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_load_store_v8bf16: +define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { +; GCN-LABEL: v_load_global_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 @@ -505,12 +514,48 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 ; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_load_store_v8bf16: +; GFX7-LABEL: v_load_global_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -518,256 
+563,438 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_load_store_v8bf16: +; GFX8-LABEL: v_load_global_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-NEXT: v_mov_b32_e32 v12, v0 +; GFX8-NEXT: v_mov_b32_e32 v13, v1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v12 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v13, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v12 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[12:13] +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v12 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_v8bf16: +; GFX9-LABEL: v_load_global_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off +; GFX9-NEXT: v_mov_b32_e32 v17, v1 +; GFX9-NEXT: v_mov_b32_e32 v16, v0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[16:17], off +; GFX9-NEXT: 
global_load_dwordx4 v[4:7], v[16:17], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_load_store_v8bf16: +; GFX10-LABEL: v_load_global_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v17, v1 +; GFX10-NEXT: v_mov_b32_e32 v16, v0 +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[16:17], off +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_load_store_v8bf16: +; GFX11-LABEL: v_load_global_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0 +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_b128 v[0:3], v[12:13], off +; GFX11-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16 +; GFX11-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32 +; GFX11-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %val = load <8 x bfloat>, ptr addrspace(1) %in - store <8 x bfloat> %val, ptr addrspace(1) %out - ret void + %load = load <32 x bfloat>, ptr addrspace(1) %ptr + ret <32 x bfloat> %load } -define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_load_store_v16bf16: +define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { +; GCN-LABEL: v_load_global_v64bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 0x6c, v0 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 0x68, v0 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 0x64, v0 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 0x60, v0 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 0x5c, v0 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 0x58, v0 +; GCN-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 +; 
GCN-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 64, v0 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 60, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v16, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 56, v0 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 52, v0 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 48, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v10, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v11, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; GCN-NEXT: buffer_store_dword v5, v20, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; GCN-NEXT: buffer_store_dword v4, v21, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v4, vcc, 36, v0 +; GCN-NEXT: buffer_store_dword v3, v22, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 24, v0 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 20, v0 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 16, v0 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; GCN-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GCN-NEXT: buffer_store_dword v8, v4, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v17, v6, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v4, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_load_store_v16bf16: +; GFX7-LABEL: v_load_global_v64bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 
0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:112 +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x7c, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x78, v0 +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x74, v0 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x70, v0 +; GFX7-NEXT: v_add_i32_e32 v19, vcc, 52, v0 +; GFX7-NEXT: v_add_i32_e32 v20, vcc, 48, v0 +; GFX7-NEXT: v_add_i32_e32 v21, vcc, 44, v0 +; GFX7-NEXT: v_add_i32_e32 v22, vcc, 40, v0 +; GFX7-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; GFX7-NEXT: v_add_i32_e32 v24, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v25, vcc, 28, v0 +; GFX7-NEXT: v_add_i32_e32 v26, vcc, 24, v0 +; GFX7-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:96 +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0 +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x64, v0 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x60, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:80 +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:64 +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 64, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v4, v9, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[1:2], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[1:2], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_dwordx4 v[11:14], v[1:2], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[15:18], v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; 
GFX7-NEXT: buffer_store_dword v4, v19, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 8, v0 +; GFX7-NEXT: buffer_store_dword v3, v20, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v9, v22, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v8, v23, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v7, v24, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(9) +; GFX7-NEXT: buffer_store_dword v14, v25, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v13, v26, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v12, v27, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(12) +; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v17, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v16, v3, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_load_store_v16bf16: +; GFX8-LABEL: v_load_global_v64bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v28, v0 +; GFX8-NEXT: v_mov_b32_e32 v29, v1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v28 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v29, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v28 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v29, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 48, v28 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v29, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 64, v28 +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v29, vcc +; GFX8-NEXT: s_movk_i32 s4, 0x50 +; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v28 +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v29, vcc +; GFX8-NEXT: s_movk_i32 s4, 0x60 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v28 +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v29, vcc +; GFX8-NEXT: s_movk_i32 s4, 0x70 +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[28:29] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[12:13] +; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28 +; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v29, vcc ; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[8:9] +; GFX8-NEXT: flat_load_dwordx4 v[16:19], v[16:17] +; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[20:21] +; GFX8-NEXT: flat_load_dwordx4 v[24:27], v[24:25] +; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_v16bf16: +; GFX9-LABEL: v_load_global_v64bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: global_store_dwordx4 v[2:3], v[8:11], off +; GFX9-NEXT: v_mov_b32_e32 v29, v1 +; GFX9-NEXT: v_mov_b32_e32 v28, v0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[28:29], off +; GFX9-NEXT: global_load_dwordx4 v[4:7], 
v[28:29], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[28:29], off offset:32 +; GFX9-NEXT: global_load_dwordx4 v[12:15], v[28:29], off offset:48 +; GFX9-NEXT: global_load_dwordx4 v[16:19], v[28:29], off offset:64 +; GFX9-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:80 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:96 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:112 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_load_store_v16bf16: +; GFX10-LABEL: v_load_global_v64bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 +; GFX10-NEXT: v_mov_b32_e32 v33, v1 +; GFX10-NEXT: v_mov_b32_e32 v32, v0 +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[32:33], off +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[32:33], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[32:33], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[12:15], v[32:33], off offset:48 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v[32:33], off offset:64 +; GFX10-NEXT: global_load_dwordx4 v[20:23], v[32:33], off offset:80 +; GFX10-NEXT: global_load_dwordx4 v[24:27], v[32:33], off offset:96 +; GFX10-NEXT: global_load_dwordx4 v[28:31], v[32:33], off offset:112 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_load_store_v16bf16: +; GFX11-LABEL: v_load_global_v64bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 -; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16 +; GFX11-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0 +; GFX11-NEXT: s_clause 0x7 +; GFX11-NEXT: global_load_b128 v[0:3], v[28:29], off +; GFX11-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16 +; GFX11-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32 +; GFX11-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48 +; GFX11-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64 +; GFX11-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80 +; GFX11-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96 +; GFX11-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b128 v[2:3], v[8:11], off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %val = load <16 x bfloat>, ptr addrspace(1) %in - store <16 x bfloat> %val, ptr addrspace(1) %out - ret void + %load = load <64 x bfloat>, ptr addrspace(1) %ptr + ret <64 x bfloat> %load } -define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_arg_store: +define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { +; GCN-LABEL: v_store_global_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: 
buffer_store_short v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_arg_store: +; GFX7-LABEL: v_store_global_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_arg_store: +; GFX8-LABEL: v_store_global_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: flat_store_short v[1:2], v0 +; GFX8-NEXT: flat_store_dword v[1:2], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_arg_store: +; GFX9-LABEL: v_store_global_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_short_d16_hi v[1:2], v0, off +; GFX9-NEXT: global_store_dword v[1:2], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_arg_store: +; GFX10-LABEL: v_store_global_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_store_short_d16_hi v[1:2], v0, off +; GFX10-NEXT: global_store_dword v[1:2], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_arg_store: +; GFX11-LABEL: v_store_global_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_store_d16_hi_b16 v[1:2], v0, off +; GFX11-NEXT: global_store_b32 v[1:2], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - store bfloat %in, ptr addrspace(1) %out + store <2 x bfloat> %val, ptr addrspace(1) %ptr ret void } -define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_arg_store_v2bf16: +define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) { +; GCN-LABEL: v_store_global_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: test_arg_store_v2bf16: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: test_arg_store_v2bf16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_store_dword v[1:2], v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: test_arg_store_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
global_store_dword v[1:2], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: test_arg_store_v2bf16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_store_dword v[1:2], v0, off -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: test_arg_store_v2bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_store_b32 v[1:2], v0, off -; GFX11-NEXT: s_setpc_b64 s[30:31] - store <2 x bfloat> %in, ptr addrspace(1) %out - ret void -} - -define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* %out) { -; GCN-LABEL: test_arg_store_v3bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GCN-NEXT: s_mov_b32 s4, s6 @@ -777,7 +1004,7 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_arg_store_v3bf16: +; GFX7-LABEL: v_store_global_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -792,7 +1019,7 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_arg_store_v3bf16: +; GFX8-LABEL: v_store_global_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_store_dword v[2:3], v0 @@ -802,7 +1029,7 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_arg_store_v3bf16: +; GFX9-LABEL: v_store_global_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4 @@ -810,26 +1037,26 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_arg_store_v3bf16: +; GFX10-LABEL: v_store_global_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4 ; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_arg_store_v3bf16: +; GFX11-LABEL: v_store_global_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - store <3 x bfloat> %in, <3 x bfloat> addrspace(1) * %out + store <3 x bfloat> %val, ptr addrspace(1) %ptr ret void } -define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_arg_store_v4bf16: +define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) { +; GCN-LABEL: v_store_global_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -844,7 +1071,7 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_arg_store_v4bf16: +; GFX7-LABEL: v_store_global_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 @@ -859,37 +1086,37 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_arg_store_v4bf16: +; GFX8-LABEL: v_store_global_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_arg_store_v4bf16: +; GFX9-LABEL: v_store_global_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_arg_store_v4bf16: +; GFX10-LABEL: v_store_global_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_arg_store_v4bf16: +; GFX11-LABEL: v_store_global_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] - store <4 x bfloat> %in, ptr addrspace(1) %out + store <4 x bfloat> %val, ptr addrspace(1) %ptr ret void } -define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_arg_store_v8bf16: +define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) { +; GCN-LABEL: v_store_global_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, 0xf000 @@ -908,7 +1135,7 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_arg_store_v8bf16: +; GFX7-LABEL: v_store_global_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 @@ -927,37 +1154,37 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_arg_store_v8bf16: +; GFX8-LABEL: v_store_global_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_arg_store_v8bf16: +; GFX9-LABEL: v_store_global_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_arg_store_v8bf16: +; GFX10-LABEL: v_store_global_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_arg_store_v8bf16: +; GFX11-LABEL: v_store_global_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] - store <8 x bfloat> %in, ptr addrspace(1) %out + store <8 x bfloat> %val, ptr addrspace(1) %ptr ret void } -define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { -; 
GCN-LABEL: test_arg_store_v16bf16: +define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) { +; GCN-LABEL: v_store_global_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 @@ -985,7 +1212,7 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_arg_store_v16bf16: +; GFX7-LABEL: v_store_global_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -1013,7 +1240,7 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_arg_store_v16bf16: +; GFX8-LABEL: v_store_global_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1024,7 +1251,7 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_arg_store_v16bf16: +; GFX9-LABEL: v_store_global_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 @@ -1032,5969 +1259,9930 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_arg_store_v16bf16: +; GFX10-LABEL: v_store_global_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_arg_store_v16bf16: +; GFX11-LABEL: v_store_global_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] - store <16 x bfloat> %in, ptr addrspace(1) %out + store <16 x bfloat> %val, ptr addrspace(1) %ptr ret void } -define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_inreg_arg_store: +define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { +; GCN-LABEL: v_store_global_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s34, s4, 16 -; GCN-NEXT: s_mov_b32 s38, 0 -; GCN-NEXT: s_mov_b32 s39, 0xf000 -; GCN-NEXT: s_mov_b32 s36, s38 -; GCN-NEXT: s_mov_b32 s37, s38 -; GCN-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v4, v31, v4, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; GCN-NEXT: v_alignbit_b32 v13, v0, v14, 16 +; GCN-NEXT: v_alignbit_b32 v12, v1, v12, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GCN-NEXT: v_alignbit_b32 v11, v0, v10, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GCN-NEXT: v_alignbit_b32 
v10, v0, v8, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; GCN-NEXT: v_alignbit_b32 v9, v0, v22, 16 +; GCN-NEXT: v_alignbit_b32 v8, v1, v20, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GCN-NEXT: v_alignbit_b32 v7, v0, v18, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v27 +; GCN-NEXT: v_alignbit_b32 v6, v0, v16, 16 +; GCN-NEXT: v_alignbit_b32 v16, v1, v28, 16 +; GCN-NEXT: v_alignbit_b32 v15, v14, v26, 16 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GCN-NEXT: v_alignbit_b32 v14, v0, v24, 16 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: v_alignbit_b32 v17, v17, v30, 16 +; GCN-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_inreg_arg_store: +; GFX7-LABEL: v_store_global_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s34, s4, 16 -; GFX7-NEXT: s_mov_b32 s38, 0 -; GFX7-NEXT: s_mov_b32 s39, 0xf000 -; GFX7-NEXT: s_mov_b32 s36, s38 -; GFX7-NEXT: s_mov_b32 s37, s38 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; GFX7-NEXT: v_alignbit_b32 v13, v0, v14, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX7-NEXT: v_alignbit_b32 v11, v0, v10, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX7-NEXT: v_alignbit_b32 v10, v0, v8, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v23 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_alignbit_b32 v9, v0, v22, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v19 +; GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v12, v1, v12, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v21 +; GFX7-NEXT: v_alignbit_b32 v7, v0, v18, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v17 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX7-NEXT: v_alignbit_b32 v8, v1, v20, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v27 +; GFX7-NEXT: v_alignbit_b32 v6, v0, v16, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v25 +; GFX7-NEXT: v_alignbit_b32 v16, v1, v28, 16 +; GFX7-NEXT: v_alignbit_b32 v15, v14, v26, 16 +; GFX7-NEXT: v_alignbit_b32 v14, v0, v24, 16 +; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; 
GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX7-NEXT: v_alignbit_b32 v17, v17, v30, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_inreg_arg_store: +; GFX8-LABEL: v_store_global_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s34, s4, 16 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v16 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v16 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v16 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v17, vcc +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_inreg_arg_store: +; GFX9-LABEL: v_store_global_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off +; GFX9-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[16:17], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_inreg_arg_store: +; GFX10-LABEL: v_store_global_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: global_store_short_d16_hi v[0:1], v2, off +; GFX10-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48 +; GFX10-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32 +; GFX10-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16 +; GFX10-NEXT: global_store_dwordx4 v[16:17], v[0:3], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_inreg_arg_store: +; GFX11-LABEL: v_store_global_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, s4 -; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v2, off +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_store_b128 v[16:17], v[12:15], off offset:48 +; GFX11-NEXT: global_store_b128 v[16:17], v[8:11], off offset:32 +; GFX11-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16 +; GFX11-NEXT: global_store_b128 v[16:17], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] - store bfloat %in, ptr addrspace(1) %out + store <32 x bfloat> %val, ptr addrspace(1) %ptr ret void } -define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { -; GCN-LABEL: test_byval: +define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { +; GCN-LABEL: v_store_global_v64bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GCN-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GCN-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GCN-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GCN-NEXT: 
v_alignbit_b32 v22, v23, v22, 16 +; GCN-NEXT: v_alignbit_b32 v21, v21, v20, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GCN-NEXT: v_alignbit_b32 v20, v19, v18, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v17 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:136 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 +; GCN-NEXT: v_alignbit_b32 v19, v19, v16, 16 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[17:18], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 +; GCN-NEXT: v_alignbit_b32 v12, v16, v12, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[17:18], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_alignbit_b32 v11, v8, v10, 16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v10, v9, v12, 16 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v8, v8, v13, 16 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[17:18], s[4:7], 0 addr64 offset:112 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_alignbit_b32 v11, v8, v10, 16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v10, v9, v12, 16 +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:68 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_alignbit_b32 v9, v8, v9, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v8, v8, v13, 16 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; 
GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v29 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v27 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[17:18], s[4:7], 0 addr64 offset:96 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v4, v12, v4, 16 +; GCN-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GCN-NEXT: v_alignbit_b32 v8, v13, v28, 16 +; GCN-NEXT: v_alignbit_b32 v7, v14, v26, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_alignbit_b32 v6, v15, v24, 16 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 +; GCN-NEXT: s_waitcnt vmcnt(9) +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_alignbit_b32 v9, v9, v30, 16 +; GCN-NEXT: v_alignbit_b32 v13, v10, v11, 16 +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_alignbit_b32 v12, v12, v16, 16 +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:56 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; GCN-NEXT: s_waitcnt vmcnt(5) +; GCN-NEXT: v_alignbit_b32 v11, v11, v0, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_lshrrev_b32_e32 v20, 16, v10 +; GCN-NEXT: v_alignbit_b32 v10, v1, v14, 16 +; GCN-NEXT: v_alignbit_b32 v22, v15, v16, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_alignbit_b32 v21, v20, v0, 16 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v19 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_alignbit_b32 v20, v15, v0, 16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_alignbit_b32 v19, v0, v14, 16 +; GCN-NEXT: buffer_store_dwordx4 v[19:22], v[17:18], s[4:7], 0 addr64 offset:80 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[17:18], s[4:7], 0 addr64 offset:64 +; GCN-NEXT: buffer_store_dwordx4 v[6:9], v[17:18], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[17:18], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_byval: +; GFX7-LABEL: v_store_global_v64bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; 
GFX7-NEXT: v_lshrrev_b32_e32 v31, 16, v5 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v5, v7, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v31, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:128 +; GFX7-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 +; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:120 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; GFX7-NEXT: v_alignbit_b32 v13, v0, v14, 16 +; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:100 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX7-NEXT: v_alignbit_b32 v12, v1, v12, 16 +; GFX7-NEXT: v_alignbit_b32 v11, v0, v10, 16 +; GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 +; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX7-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; GFX7-NEXT: v_alignbit_b32 v8, v21, v20, 16 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: v_alignbit_b32 v9, v23, v22, 16 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: s_waitcnt vmcnt(10) +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: s_waitcnt vmcnt(9) +; GFX7-NEXT: v_alignbit_b32 v23, v6, v7, 16 +; GFX7-NEXT: s_waitcnt vmcnt(8) +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: v_alignbit_b32 v22, v15, v31, 16 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_lshrrev_b32_e32 v20, 16, v32 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_alignbit_b32 v21, v20, v33, 16 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_lshrrev_b32_e32 v32, 16, v34 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v19 +; GFX7-NEXT: v_alignbit_b32 v7, v6, v18, 16 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_alignbit_b32 v20, v32, v14, 16 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:20 +; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:12 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: buffer_store_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:112 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:96 +; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 +; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:88 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 +; GFX7-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:80 +; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; GFX7-NEXT: v_alignbit_b32 v6, v6, v16, 16 +; GFX7-NEXT: v_alignbit_b32 v16, v15, v28, 16 +; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:72 +; GFX7-NEXT: buffer_load_dword v29, 
off, s[0:3], s32 offset:68 +; GFX7-NEXT: s_waitcnt vmcnt(14) +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; GFX7-NEXT: v_alignbit_b32 v17, v14, v30, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v27 +; GFX7-NEXT: v_alignbit_b32 v15, v14, v26, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v25 +; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_alignbit_b32 v14, v14, v24, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v31 +; GFX7-NEXT: s_waitcnt vmcnt(14) +; GFX7-NEXT: v_alignbit_b32 v21, v19, v32, 16 +; GFX7-NEXT: s_waitcnt vmcnt(13) +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(12) +; GFX7-NEXT: v_alignbit_b32 v20, v19, v34, 16 +; GFX7-NEXT: s_waitcnt vmcnt(11) +; GFX7-NEXT: v_lshrrev_b32_e32 v19, 16, v36 +; GFX7-NEXT: s_waitcnt vmcnt(10) +; GFX7-NEXT: v_alignbit_b32 v19, v19, v37, 16 +; GFX7-NEXT: s_waitcnt vmcnt(8) +; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: v_alignbit_b32 v25, v22, v23, 16 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_lshrrev_b32_e32 v24, 16, v38 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_alignbit_b32 v24, v24, v18, 16 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 16, v39 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_alignbit_b32 v23, v18, v48, 16 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:56 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GFX7-NEXT: s_waitcnt vmcnt(8) +; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v28 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: v_alignbit_b32 v22, v22, v29, 16 +; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40 +; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64 offset:96 +; GFX7-NEXT: s_waitcnt vmcnt(10) +; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; GFX7-NEXT: s_waitcnt vmcnt(9) +; GFX7-NEXT: v_alignbit_b32 v18, v22, v18, 16 +; GFX7-NEXT: s_waitcnt vmcnt(8) +; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v27 +; GFX7-NEXT: s_waitcnt vmcnt(7) +; GFX7-NEXT: v_alignbit_b32 v25, v22, v30, 16 +; GFX7-NEXT: s_waitcnt vmcnt(6) +; GFX7-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; GFX7-NEXT: s_waitcnt vmcnt(5) +; GFX7-NEXT: v_alignbit_b32 v24, v23, v32, 16 +; GFX7-NEXT: s_waitcnt vmcnt(4) +; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v33 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_alignbit_b32 v23, v22, v28, 16 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_lshrrev_b32_e32 v22, 16, v29 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_alignbit_b32 v22, v22, v34, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[22:25], v[0:1], s[4:7], 0 addr64 offset:80 +; GFX7-NEXT: buffer_store_dwordx4 v[18:21], v[0:1], s[4:7], 0 addr64 offset:64 +; GFX7-NEXT: buffer_store_dwordx4 v[14:17], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_store_dwordx4 v[10:13], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_byval: +; GFX8-LABEL: v_store_global_v64bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX8-NEXT: s_movk_i32 s4, 0x70 +; GFX8-NEXT: s_movk_i32 s5, 0x50 +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v34, vcc, s4, v32 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_addc_u32_e32 v35, vcc, 0, v33, vcc +; GFX8-NEXT: s_movk_i32 s4, 0x60 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx4 v[34:35], v[28:31] +; GFX8-NEXT: flat_store_dwordx4 v[32:33], v[0:3] +; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v32 +; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v32 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v33, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v32 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v33, vcc +; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27] +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 48, v32 +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, 0, v33, vcc +; GFX8-NEXT: v_add_u32_e32 v26, vcc, 32, v32 +; GFX8-NEXT: v_addc_u32_e32 v27, vcc, 0, v33, vcc +; GFX8-NEXT: v_add_u32_e32 v28, vcc, 16, v32 +; GFX8-NEXT: v_addc_u32_e32 v29, vcc, 0, v33, vcc +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23] +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19] +; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[4:7] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_byval: +; GFX9-LABEL: v_store_global_v64bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 +; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 +; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 +; GFX9-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 +; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 +; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 +; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_byval: +; GFX10-LABEL: v_store_global_v64bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 +; GFX10-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 +; GFX10-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 +; GFX10-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 +; GFX10-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 +; GFX10-NEXT: global_store_dwordx4 v[32:33], 
v[8:11], off offset:32
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16
+; GFX10-NEXT: global_store_dwordx4 v[32:33], v[0:3], off
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: test_byval:
+; GFX11-LABEL: v_store_global_v64bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s32
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_clause 0x7
+; GFX11-NEXT: global_store_b128 v[32:33], v[28:31], off offset:112
+; GFX11-NEXT: global_store_b128 v[32:33], v[24:27], off offset:96
+; GFX11-NEXT: global_store_b128 v[32:33], v[20:23], off offset:80
+; GFX11-NEXT: global_store_b128 v[32:33], v[16:19], off offset:64
+; GFX11-NEXT: global_store_b128 v[32:33], v[12:15], off offset:48
+; GFX11-NEXT: global_store_b128 v[32:33], v[8:11], off offset:32
+; GFX11-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16
+; GFX11-NEXT: global_store_b128 v[32:33], v[0:3], off
; GFX11-NEXT: s_setpc_b64 s[30:31]
- store bfloat %val, ptr addrspace(5) %bv
- %retval = load bfloat, ptr addrspace(5) %bv
- ret bfloat %retval
+ store <64 x bfloat> %val, ptr addrspace(1) %ptr
+ ret void
}
-define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
-; GCN-LABEL: test_sret:
+define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) {
+; GCN-LABEL: test_store_fpimm:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, 0
+; GCN-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GCN-NEXT: v_mov_b32_e32 v5, 0x4228
+; GCN-NEXT: s_mov_b32 s4, s6
+; GCN-NEXT: s_mov_b32 s5, s6
+; GCN-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GCN-NEXT: buffer_store_short v5, v[2:3], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: test_sret:
+; GFX7-LABEL: test_store_fpimm:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX7-NEXT: buffer_store_short v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: test_sret:
+; GFX8-LABEL: test_store_fpimm:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
+; GFX8-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX8-NEXT: flat_store_short v[0:1], v4
+; GFX8-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX8-NEXT: flat_store_short v[2:3], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: test_sret:
+; GFX9-LABEL: test_store_fpimm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen
+; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f80
+; GFX9-NEXT: global_store_short v[0:1], v4, off
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x4228
+; GFX9-NEXT:
global_store_short v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_sret: +; GFX10-LABEL: test_store_fpimm: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen +; GFX10-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX10-NEXT: v_mov_b32_e32 v5, 0x4228 +; GFX10-NEXT: global_store_short v[0:1], v4, off +; GFX10-NEXT: global_store_short v[2:3], v5, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_sret: +; GFX11-LABEL: test_store_fpimm: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_store_d16_hi_b16 v0, v1, off +; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228 +; GFX11-NEXT: global_store_b16 v[0:1], v4, off +; GFX11-NEXT: global_store_b16 v[2:3], v5, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - store bfloat %val, ptr addrspace(5) %sret + store bfloat 1.0, ptr addrspace(1) %ptr0 + store bfloat 42.0, ptr addrspace(1) %ptr1 ret void } -define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { -; GCN-LABEL: test_bitcast_from_bfloat: +define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store_f32_to_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_bitcast_from_bfloat: +; GFX7-LABEL: test_load_store_f32_to_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_bitcast_from_bfloat: +; GFX8-LABEL: test_load_store_f32_to_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: flat_store_short v[2:3], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_bitcast_from_bfloat: +; GFX9-LABEL: test_load_store_f32_to_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_bitcast_from_bfloat: +; GFX10-LABEL: test_load_store_f32_to_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: 
global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_short v[2:3], v0, off +; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_bitcast_from_bfloat: +; GFX11-LABEL: test_load_store_f32_to_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %val = load bfloat, ptr addrspace(1) %in - %val_int = bitcast bfloat %val to i16 - store i16 %val_int, ptr addrspace(1) %out + %val = load float, ptr addrspace(1) %in + %val.bf16 = fptrunc float %val to bfloat + store bfloat %val.bf16, ptr addrspace(1) %out ret void } -define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; GCN-LABEL: test_bitcast_to_bfloat: +define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store_f64_to_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_bitcast_to_bfloat: +; GFX7-LABEL: test_load_store_f64_to_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_bitcast_to_bfloat: +; GFX8-LABEL: test_load_store_f64_to_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_ushort v2, v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_short v[0:1], v2 +; GFX8-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: flat_store_short v[2:3], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_bitcast_to_bfloat: +; GFX9-LABEL: test_load_store_f64_to_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v[0:1], v2, off +; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
test_bitcast_to_bfloat: +; GFX10-LABEL: test_load_store_f64_to_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v[2:3], off +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_bitcast_to_bfloat: +; GFX11-LABEL: test_load_store_f64_to_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v2, v[2:3], off +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_store_b16 v[0:1], v2, off +; GFX11-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %val = load i16, ptr addrspace(1) %in - %val_fp = bitcast i16 %val to bfloat - store bfloat %val_fp, ptr addrspace(1) %out + %val = load double, ptr addrspace(1) %in + %val.bf16 = fptrunc double %val to bfloat + store bfloat %val.bf16, ptr addrspace(1) %out ret void } -define bfloat @test_ret(bfloat %in) { -; GCN-LABEL: test_ret: -; GCN: ; %bb.0: ; %entry +define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store_bf16_to_f32: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_ret: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_load_store_bf16_to_f32: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_ret: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_load_store_bf16_to_f32: +; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_ret: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_load_store_bf16_to_f32: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_short_d16_hi v4, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[2:3], v4, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_ret: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_load_store_bf16_to_f32: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off +; GFX10-NEXT: s_waitcnt 
vmcnt(0) +; GFX10-NEXT: global_store_dword v[2:3], v4, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_ret: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_load_store_bf16_to_f32: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[2:3], v4, off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - ret bfloat %in + %val = load bfloat, ptr addrspace(1) %in + %val.f32 = fpext bfloat %val to float + store float %val.f32, ptr addrspace(1) %out + ret void } -define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) { -; GCN-LABEL: test_ret_v2bf16: -; GCN: ; %bb.0: ; %entry +define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store_bf16_to_f64: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_ret_v2bf16: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_load_store_bf16_to_f64: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_ret_v2bf16: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_load_store_bf16_to_f64: +; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_ret_v2bf16: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_load_store_bf16_to_f64: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_short_d16_hi v4, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_ret_v2bf16: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_load_store_bf16_to_f64: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_ret_v2bf16: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_load_store_bf16_to_f64: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - ret <2 x bfloat> %in +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val = load bfloat, ptr addrspace(1) %in + %val.f64 = fpext bfloat %val to double + store double %val.f64, ptr addrspace(1) %out + ret void } -define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) { -; GCN-LABEL: test_ret_v3bf16: -; GCN: ; %bb.0: ; %entry +define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store_v2bf16: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_ret_v3bf16: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_load_store_v2bf16: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_ret_v3bf16: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_load_store_v2bf16: +; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_ret_v3bf16: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_load_store_v2bf16: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_ret_v3bf16: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_load_store_v2bf16: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_ret_v3bf16: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_load_store_v2bf16: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - ret <3 x bfloat> %in + %val = load <2 x bfloat>, ptr addrspace(1) %in + store <2 x bfloat> %val, ptr addrspace(1) %out + ret void } -define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) { -; GCN-LABEL: test_ret_v4bf16: -; GCN: ; %bb.0: ; %entry +define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr 
addrspace(1) %out) { +; GCN-LABEL: test_load_store_v4bf16: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_ret_v4bf16: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_load_store_v4bf16: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_ret_v4bf16: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_load_store_v4bf16: +; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_ret_v4bf16: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_load_store_v4bf16: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_ret_v4bf16: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_load_store_v4bf16: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_ret_v4bf16: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_load_store_v4bf16: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - ret <4 x bfloat> %in + %val = load <4 x bfloat>, ptr addrspace(1) %in + store <4 x bfloat> %val, ptr addrspace(1) %out + ret void } -define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) { -; GCN-LABEL: test_ret_v8bf16: -; GCN: ; %bb.0: ; %entry +define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store_v8bf16: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_ret_v8bf16: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_load_store_v8bf16: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; 
GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_ret_v8bf16: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_load_store_v8bf16: +; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_ret_v8bf16: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_load_store_v8bf16: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_ret_v8bf16: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_load_store_v8bf16: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_ret_v8bf16: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_load_store_v8bf16: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - ret <8 x bfloat> %in + %val = load <8 x bfloat>, ptr addrspace(1) %in + store <8 x bfloat> %val, ptr addrspace(1) %out + ret void } -define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) { -; GCN-LABEL: test_ret_v16bf16: -; GCN: ; %bb.0: ; %entry +define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_load_store_v16bf16: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_ret_v16bf16: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_load_store_v16bf16: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[8:11], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[4:7], v[2:3], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: buffer_store_dwordx4 v[8:11], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_ret_v16bf16: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_load_store_v16bf16: +; GFX8: 
; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GFX8-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_ret_v16bf16: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_load_store_v16bf16: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: test_ret_v16bf16: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_store_dwordx4 v[2:3], v[8:11], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_ret_v16bf16: -; GFX11: ; %bb.0: ; %entry +; GFX10-LABEL: test_load_store_v16bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: global_store_dwordx4 v[2:3], v[4:7], off offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[2:3], v[8:11], off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_load_store_v16bf16: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 +; GFX11-NEXT: global_load_b128 v[8:11], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[2:3], v[8:11], off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - ret <16 x bfloat> %in + %val = load <16 x bfloat>, ptr addrspace(1) %in + store <16 x bfloat> %val, ptr addrspace(1) %out + ret void } -define void @test_call(bfloat %in, ptr addrspace(5) %out) { -; GCN-LABEL: test_call: -; GCN: ; %bb.0: ; %entry +define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_arg_store: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v2, s30, 0 -; GCN-NEXT: v_writelane_b32 v2, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; 
GCN-NEXT: v_readlane_b32 s31, v2, 1 -; GCN-NEXT: v_readlane_b32 s30, v2, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_call: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_arg_store: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 -; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v2, s30, 0 -; GFX7-NEXT: v_writelane_b32 v2, s31, 1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v2, 1 -; GFX7-NEXT: v_readlane_b32 s30, v2, 0 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_call: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_arg_store: +; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 -; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v2, s30, 0 -; GFX8-NEXT: v_writelane_b32 v2, s31, 1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v2, 1 -; GFX8-NEXT: v_readlane_b32 s30, v2, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: flat_store_short v[1:2], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_arg_store: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 
-; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: global_store_short_d16_hi v[1:2], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_call: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_arg_store: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v2, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v2, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_readlane_b32 s31, v2, 1 -; GFX10-NEXT: v_readlane_b32 s30, v2, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short_d16_hi v[1:2], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_call: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_arg_store: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v2, s30, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v2, s31, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_readlane_b32 s31, v2, 1 -; GFX11-NEXT: v_readlane_b32 s30, v2, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; 
GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_d16_hi_b16 v[1:2], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call bfloat @test_arg_store(bfloat %in) - store volatile bfloat %result, ptr addrspace(5) %out + store bfloat %in, ptr addrspace(1) %out ret void } -define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { -; GCN-LABEL: test_call_v2bf16: -; GCN: ; %bb.0: ; %entry +define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_arg_store_v2bf16: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v3, s30, 0 -; GCN-NEXT: v_writelane_b32 v3, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 2, v2 -; GCN-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v3, 1 -; GCN-NEXT: v_readlane_b32 s30, v3, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_call_v2bf16: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_arg_store_v2bf16: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 -; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v3, s30, 0 -; GFX7-NEXT: v_writelane_b32 v3, s31, 1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 2, v2 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v3, 1 -; GFX7-NEXT: v_readlane_b32 s30, v3, 0 -; 
GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_call_v2bf16: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_arg_store_v2bf16: +; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 -; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v2, s30, 0 -; GFX8-NEXT: v_writelane_b32 v2, s31, 1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v2, 1 -; GFX8-NEXT: v_readlane_b32 s30, v2, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: flat_store_dword v[1:2], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v2bf16: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_arg_store_v2bf16: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: global_store_dword v[1:2], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_call_v2bf16: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_arg_store_v2bf16: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v2, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v2, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_readlane_b32 s31, v2, 1 -; GFX10-NEXT: v_readlane_b32 s30, v2, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dword v[1:2], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_call_v2bf16: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_arg_store_v2bf16: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v2, s30, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v2, s31, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: scratch_store_b32 v1, v0, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_readlane_b32 s31, v2, 1 -; GFX11-NEXT: v_readlane_b32 s30, v2, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b32 v[1:2], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in) - store volatile <2 x bfloat> %result, ptr addrspace(5) %out + store <2 x bfloat> %in, ptr addrspace(1) %out ret void } -define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { -; GCN-LABEL: test_call_v3bf16: -; GCN: ; %bb.0: ; %entry +define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* %out) { +; GCN-LABEL: test_arg_store_v3bf16: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v4, s30, 0 -; GCN-NEXT: v_writelane_b32 v4, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 
s[30:31], s[4:5] ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v3 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: buffer_store_short v2, v5, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v4, 1 -; GCN-NEXT: v_readlane_b32 s30, v4, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_short v2, v[3:4], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_call_v3bf16: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_arg_store_v3bf16: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 -; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v4, s30, 0 -; GFX7-NEXT: v_writelane_b32 v4, s31, 1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3 -; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v4, 1 -; GFX7-NEXT: v_readlane_b32 s30, v4, 0 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_store_short v1, v[3:4], s[4:7], 0 addr64 offset:4 +; GFX7-NEXT: buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_call_v3bf16: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_arg_store_v3bf16: +; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 -; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v3, s30, 0 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX8-NEXT: v_writelane_b32 v3, 
s31, 1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v2 -; GFX8-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v3, 1 -; GFX8-NEXT: v_readlane_b32 s30, v3, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: flat_store_short v[2:3], v1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v3bf16: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_arg_store_v3bf16: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: global_store_short v[2:3], v1, off offset:4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_call_v3bf16: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_arg_store_v3bf16: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v3, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: v_writelane_b32 v3, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_readlane_b32 s31, v3, 1 -; 
GFX10-NEXT: v_readlane_b32 s30, v3, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_short v[2:3], v1, off offset:4 +; GFX10-NEXT: global_store_dword v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_call_v3bf16: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_arg_store_v3bf16: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v3, s30, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_writelane_b32 v3, s31, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b32 v2, v0, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_readlane_b32 s31, v3, 1 -; GFX11-NEXT: v_readlane_b32 s30, v3, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in) - store volatile <3 x bfloat> %result, ptr addrspace(5) %out + store <3 x bfloat> %in, <3 x bfloat> addrspace(1) * %out ret void } -define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { -; GCN-LABEL: test_call_v4bf16: -; GCN: ; %bb.0: ; %entry +define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_arg_store_v4bf16: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v5, s30, 0 -; GCN-NEXT: v_writelane_b32 v5, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 6, v4 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 4, v4 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 2, v4 -; GCN-NEXT: buffer_store_short v3, v6, 
s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v7, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v5, 1 -; GCN-NEXT: v_readlane_b32 s30, v5, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v0, v6, v0, 16 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_call_v4bf16: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_arg_store_v4bf16: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 -; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v5, s30, 0 -; GFX7-NEXT: v_writelane_b32 v5, s31, 1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 6, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: buffer_store_short v3, v6, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v4 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v5, 1 -; GFX7-NEXT: v_readlane_b32 s30, v5, 0 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_store_dwordx2 v[1:2], v[4:5], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_call_v4bf16: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_arg_store_v4bf16: +; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 -; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 
0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v3, s30, 0 -; GFX8-NEXT: v_writelane_b32 v3, s31, 1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: buffer_store_short v1, v6, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v2 -; GFX8-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 -; GFX8-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v3, 1 -; GFX8-NEXT: v_readlane_b32 s30, v3, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v4bf16: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_arg_store_v4bf16: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_call_v4bf16: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_arg_store_v4bf16: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: 
s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v3, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v3, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_readlane_b32 s31, v3, 1 -; GFX10-NEXT: v_readlane_b32 s30, v3, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_call_v4bf16: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_arg_store_v4bf16: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v3, s30, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v3, s31, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_add_nc_u32_e32 v4, 6, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_readlane_b32 s31, v3, 1 -; GFX11-NEXT: v_readlane_b32 s30, v3, 0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v4, v1, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v2, v0, off offset:2 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v2, v0, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in) - store volatile <4 x bfloat> %result, ptr addrspace(5) %out + store <4 x bfloat> %in, ptr addrspace(1) %out ret void } -define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { -; GCN-LABEL: test_call_v8bf16: -; GCN: ; %bb.0: ; %entry +define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_arg_store_v8bf16: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; 
GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v9, s30, 0 -; GCN-NEXT: v_writelane_b32 v9, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 14, v8 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 12, v8 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v8 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 8, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 6, v8 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v8 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 2, v8 -; GCN-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v6, v11, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, v13, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v14, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v16, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v9, 1 -; GCN-NEXT: v_readlane_b32 s30, v9, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v4, v10, v4, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[8:9], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_call_v8bf16: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_arg_store_v8bf16: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 -; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v9, s30, 0 -; GFX7-NEXT: v_writelane_b32 v9, s31, 1 -; 
GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_add_i32_e32 v10, vcc, 14, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v8 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v9, 1 -; GFX7-NEXT: v_readlane_b32 s30, v9, 0 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[8:9], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_call_v8bf16: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_arg_store_v8bf16: +; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 -; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v5, s30, 0 -; GFX8-NEXT: v_writelane_b32 v5, s31, 1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 12, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX8-NEXT: buffer_store_short v3, v10, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX8-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: 
buffer_store_short v0, v4, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v4 -; GFX8-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v4 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX8-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v4 -; GFX8-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4 -; GFX8-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v5, 1 -; GFX8-NEXT: v_readlane_b32 s30, v5, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v8bf16: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_arg_store_v8bf16: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v5, s30, 0 -; GFX9-NEXT: v_writelane_b32 v5, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v5, 1 -; GFX9-NEXT: v_readlane_b32 s30, v5, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_call_v8bf16: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_arg_store_v8bf16: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 
exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v5, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v5, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen offset:12 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen offset:8 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen offset:4 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_readlane_b32 s31, v5, 1 -; GFX10-NEXT: v_readlane_b32 s30, v5, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_call_v8bf16: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_arg_store_v8bf16: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v5, s30, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v5, s31, 1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_add_nc_u32_e32 v6, 14, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 12, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v8, 10, v4 -; GFX11-NEXT: v_add_nc_u32_e32 v9, 6, v4 -; GFX11-NEXT: v_readlane_b32 s31, v5, 1 -; GFX11-NEXT: scratch_store_d16_hi_b16 v6, v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v7, v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v8, v2, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v4, v2, off offset:8 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v9, v1, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v4, v1, off offset:4 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v4, v0, off offset:2 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v4, v0, 
off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_readlane_b32 s30, v5, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in) - store volatile <8 x bfloat> %result, ptr addrspace(5) %out + store <8 x bfloat> %in, ptr addrspace(1) %out ret void } -define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { -; GCN-LABEL: test_call_v16bf16: -; GCN: ; %bb.0: ; %entry +define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_arg_store_v16bf16: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v17, s30, 0 -; GCN-NEXT: v_writelane_b32 v17, s31, 1 -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 30, v16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 28, v16 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 26, v16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 24, v16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 22, v16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 20, v16 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 18, v16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 16, v16 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 14, v16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 12, v16 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 10, v16 -; GCN-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v15, vcc, 8, v16 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 6, v16 -; GCN-NEXT: buffer_store_short v14, v19, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 2, v16 -; GCN-NEXT: buffer_store_short v13, v20, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v12, v21, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: 
buffer_store_short v11, v22, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v10, v23, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v9, v24, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v8, v25, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v7, v26, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v6, v27, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, v15, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v18, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v19, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v17, 1 -; GCN-NEXT: v_readlane_b32 s30, v17, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: v_lshrrev_b32_e32 v19, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_alignbit_b32 v5, v7, v6, 16 +; GCN-NEXT: v_alignbit_b32 v4, v18, v4, 16 +; GCN-NEXT: v_alignbit_b32 v3, v3, v2, 16 +; GCN-NEXT: v_alignbit_b32 v2, v1, v0, 16 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: v_alignbit_b32 v13, v15, v14, 16 +; GCN-NEXT: v_alignbit_b32 v12, v19, v12, 16 +; GCN-NEXT: v_alignbit_b32 v11, v11, v10, 16 +; GCN-NEXT: v_alignbit_b32 v10, v9, v8, 16 +; GCN-NEXT: buffer_store_dwordx4 v[10:13], v[16:17], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_store_dwordx4 v[2:5], v[16:17], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_call_v16bf16: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_arg_store_v16bf16: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 -; GFX7-NEXT: s_mov_b32 s33, s32 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: s_getpc_b64 s[4:5] -; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v17, s30, 0 -; GFX7-NEXT: v_writelane_b32 v17, s31, 1 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 30, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX7-NEXT: buffer_store_short v14, v15, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v14, vcc, 26, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: buffer_store_short v13, v14, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: 
v_add_i32_e32 v13, vcc, 24, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX7-NEXT: buffer_store_short v12, v13, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v12, vcc, 22, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: buffer_store_short v11, v12, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v11, vcc, 20, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: buffer_store_short v10, v11, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v10, vcc, 18, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: buffer_store_short v9, v10, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v9, vcc, 16, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: buffer_store_short v8, v9, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 14, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v16 -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v17, 1 -; GFX7-NEXT: v_readlane_b32 s30, v17, 0 -; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: v_alignbit_b32 v5, v5, v4, 16 +; GFX7-NEXT: v_alignbit_b32 v4, v3, v2, 16 +; GFX7-NEXT: v_alignbit_b32 v3, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX7-NEXT: v_alignbit_b32 v14, v0, v14, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GFX7-NEXT: v_alignbit_b32 v13, v0, v12, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: v_alignbit_b32 v12, v0, v10, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: v_alignbit_b32 v11, v0, v8, 16 +; GFX7-NEXT: v_alignbit_b32 v6, v7, v6, 16 +; GFX7-NEXT: buffer_store_dwordx4 v[11:14], v[16:17], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_store_dwordx4 v[3:6], v[16:17], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_call_v16bf16: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_arg_store_v16bf16: 
+; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 -; GFX8-NEXT: s_mov_b32 s33, s32 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0x400 -; GFX8-NEXT: s_getpc_b64 s[4:5] -; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v9, s30, 0 -; GFX8-NEXT: v_writelane_b32 v9, s31, 1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v18, vcc, 28, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX8-NEXT: buffer_store_short v7, v18, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 24, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GFX8-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5 -; GFX8-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4 -; GFX8-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 12, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v3 -; GFX8-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX8-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GFX8-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 30, v8 -; GFX8-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 26, v8 -; GFX8-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 22, v8 -; GFX8-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 18, v8 -; GFX8-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v8 -; GFX8-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; GFX8-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v8 -; GFX8-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v8 -; GFX8-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v9, 1 -; GFX8-NEXT: v_readlane_b32 s30, v9, 0 -; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_nop 0 +; 
GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v8 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v16bf16: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_arg_store_v16bf16: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v9, s30, 0 -; GFX9-NEXT: v_writelane_b32 v9, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen offset:28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v6, v8, s[0:3], 0 offen offset:24 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v5, v8, s[0:3], 0 offen offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v4, v8, s[0:3], 0 offen offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v3, v8, s[0:3], 0 offen offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v2, v8, s[0:3], 0 offen offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v9, 1 -; GFX9-NEXT: v_readlane_b32 s30, v9, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 +; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_call_v16bf16: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_arg_store_v16bf16: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 -; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: s_waitcnt_depctr 
0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v9, s30, 0 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v9, s31, 1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen offset:28 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v6, v8, s[0:3], 0 offen offset:24 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v5, v8, s[0:3], 0 offen offset:20 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v4, v8, s[0:3], 0 offen offset:16 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v3, v8, s[0:3], 0 offen offset:12 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v2, v8, s[0:3], 0 offen offset:8 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen offset:4 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_readlane_b32 s31, v9, 1 -; GFX10-NEXT: v_readlane_b32 s30, v9, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 -; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[4:7], off offset:16 +; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_call_v16bf16: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_arg_store_v16bf16: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 -; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v9, s30, 0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v9, s31, 1 -; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_add_nc_u32_e32 v10, 30, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 28, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v12, 20, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v13, 18, v8 -; GFX11-NEXT: v_readlane_b32 s31, v9, 1 -; GFX11-NEXT: scratch_store_d16_hi_b16 v10, v7, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v11, v7, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 26, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v10, 24, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v11, 22, v8 -; GFX11-NEXT: v_readlane_b32 s30, v9, 0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v7, v6, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v10, v6, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v11, v5, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v12, v5, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v13, v4, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v5, 14, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v6, 12, v8 -; GFX11-NEXT: v_add_nc_u32_e32 v7, 10, v8 -; GFX11-NEXT: scratch_store_b16 v8, v4, off offset:16 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v4, 6, v8 -; GFX11-NEXT: scratch_store_d16_hi_b16 v5, v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v6, v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v7, v2, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v8, v2, off offset:8 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v4, v1, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v8, v1, off offset:4 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_d16_hi_b16 v8, v0, off offset:2 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_store_b16 v8, v0, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 -; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 +; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in) - store volatile <16 x bfloat> %result, ptr addrspace(5) %out + store <16 x bfloat> %in, ptr addrspace(1) %out ret void } -define bfloat @test_alloca_load_store_ret(bfloat %in) { -; GCN-LABEL: test_alloca_load_store_ret: -; GCN: ; %bb.0: ; %entry +define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_inreg_arg_store: +; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GCN-NEXT: s_lshr_b32 s34, s4, 16 +; GCN-NEXT: s_mov_b32 s38, 0 +; GCN-NEXT: s_mov_b32 s39, 0xf000 +; GCN-NEXT: s_mov_b32 s36, s38 +; GCN-NEXT: s_mov_b32 s37, s38 +; GCN-NEXT: v_mov_b32_e32 v2, s34 +; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-NEXT: buffer_load_ushort v0, 
off, s[0:3], s32 glc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_alloca_load_store_ret: -; GFX7: ; %bb.0: ; %entry +; GFX7-LABEL: test_inreg_arg_store: +; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GFX7-NEXT: s_lshr_b32 s34, s4, 16 +; GFX7-NEXT: s_mov_b32 s38, 0 +; GFX7-NEXT: s_mov_b32 s39, 0xf000 +; GFX7-NEXT: s_mov_b32 s36, s38 +; GFX7-NEXT: s_mov_b32 s37, s38 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_alloca_load_store_ret: -; GFX8: ; %bb.0: ; %entry +; GFX8-LABEL: test_inreg_arg_store: +; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GFX8-NEXT: s_lshr_b32 s34, s4, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_alloca_load_store_ret: -; GFX9: ; %bb.0: ; %entry +; GFX9-LABEL: test_inreg_arg_store: +; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 glc +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: global_store_short_d16_hi v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_alloca_load_store_ret: -; GFX10: ; %bb.0: ; %entry +; GFX10-LABEL: test_inreg_arg_store: +; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s32 glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: global_store_short_d16_hi v[0:1], v2, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_alloca_load_store_ret: -; GFX11: ; %bb.0: ; %entry +; GFX11-LABEL: test_inreg_arg_store: +; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, s32 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] -entry: - %in.addr = alloca bfloat, align 2, addrspace(5) - store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2 - %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2 - ret bfloat %loaded + store bfloat %in, ptr addrspace(1) %out + ret void } -define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { -; GCN-LABEL: test_overflow_stack: +define 
bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { +; GCN-LABEL: test_byval: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GCN-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_byval: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX7-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_byval: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-NEXT: buffer_store_short v1, off, s[0:3], s32 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_byval: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_byval: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_byval: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s32 +; GFX11-NEXT: s_setpc_b64 s[30:31] + store bfloat %val, ptr addrspace(5) %bv + %retval = load bfloat, ptr addrspace(5) %bv + ret bfloat %retval +} + +define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { +; GCN-LABEL: test_sret: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0 -; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x64, v0 -; GCN-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x60, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 -; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0 -; GCN-NEXT: 
buffer_store_dword v25, v31, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 -; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v24, vcc, 52, v0 -; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v23, vcc, 48, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 -; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; GCN-NEXT: buffer_store_dword v21, v30, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0 -; GCN-NEXT: buffer_store_dword v20, v26, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v0 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 20, v0 -; GCN-NEXT: buffer_store_dword v19, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 12, v0 -; GCN-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 -; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v23, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v26, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen ; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: test_overflow_stack: +; GFX7-LABEL: test_sret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x74, v0 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; GFX7-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; GFX7-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; GFX7-NEXT: 
buffer_store_dword v28, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; GFX7-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; GFX7-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; GFX7-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; GFX7-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; GFX7-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; GFX7-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; GFX7-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; GFX7-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; GFX7-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; GFX7-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; GFX7-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; GFX7-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; GFX7-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; GFX7-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; GFX7-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 -; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: test_overflow_stack: +; GFX8-LABEL: test_sret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x7c, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x78, v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 -; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x74, v0 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: 
buffer_store_dword v2, v31, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 -; GFX8-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 -; GFX8-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 -; GFX8-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 -; GFX8-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 -; GFX8-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 -; GFX8-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 -; GFX8-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 -; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 -; GFX8-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 -; GFX8-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 -; GFX8-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 -; GFX8-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 -; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 60, v0 -; GFX8-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 56, v0 -; GFX8-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; GFX8-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; GFX8-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0 -; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 36, v0 -; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; GFX8-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v0 -; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 24, v0 -; GFX8-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v0 -; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 12, v0 -; GFX8-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 -; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_overflow_stack: +; GFX9-LABEL: test_sret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 -; 
GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_overflow_stack: +; GFX10-LABEL: test_sret: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 -; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 -; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 -; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 -; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 -; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 -; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 -; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 -; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 -; GFX10-NEXT: buffer_store_dword v19, 
v0, s[0:3], 0 offen offset:68 -; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 -; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 -; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 -; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 -; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 -; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 -; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 -; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 -; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 -; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 -; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 -; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 -; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 -; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 -; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:124 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:116 -; GFX10-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_overflow_stack: +; GFX11-LABEL: test_sret: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_store_b128 off, v[18:21], s0 offset:64 -; GFX11-NEXT: scratch_store_b128 off, v[10:13], s0 offset:32 -; GFX11-NEXT: scratch_store_b128 off, v[6:9], s0 offset:16 -; GFX11-NEXT: scratch_store_b128 off, v[2:5], s0 -; GFX11-NEXT: scratch_store_d16_hi_b16 off, v1, s0 offset:128 -; GFX11-NEXT: s_add_i32 s1, s0, 0x70 -; GFX11-NEXT: s_add_i32 s2, s0, 0x60 -; GFX11-NEXT: s_add_i32 s3, s0, 0x50 -; GFX11-NEXT: s_add_i32 s0, s0, 48 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[30:33], s1 -; GFX11-NEXT: scratch_store_b128 off, v[26:29], s2 -; GFX11-NEXT: scratch_store_b128 off, v[22:25], s3 -; GFX11-NEXT: scratch_store_b128 off, v[14:17], s0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v0, v1, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0 - %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1 - ret { <32 x i32>, bfloat } %ins.1 + store bfloat %val, ptr addrspace(5) %sret + ret void } -define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v2bf16_to_v2f32: +define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; GCN-LABEL: test_bitcast_from_bfloat: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dword v1, v[0:1], 
s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v2bf16_to_v2f32: +; GFX7-LABEL: test_bitcast_from_bfloat: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v2bf16_to_v2f32: +; GFX8-LABEL: test_bitcast_from_bfloat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_short v[2:3], v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v2bf16_to_v2f32: +; GFX9-LABEL: test_bitcast_from_bfloat: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v2bf16_to_v2f32: +; GFX10-LABEL: test_bitcast_from_bfloat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v[0:1], off +; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: global_store_short v[2:3], v0, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v2bf16_to_v2f32: +; GFX11-LABEL: test_bitcast_from_bfloat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: global_store_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <2 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <2 x bfloat> %load to <2 x float> - ret <2 x float> %fpext + %val = load bfloat, ptr addrspace(1) %in + %val_int = bitcast bfloat %val to i16 + store i16 %val_int, ptr addrspace(1) %out + ret void } -define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v3bf16_to_v3f32: +define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GCN-LABEL: test_bitcast_to_bfloat: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s6, 0 ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: 
s_mov_b32 s4, s6 ; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v3bf16_to_v3f32: +; GFX7-LABEL: test_bitcast_to_bfloat: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v3bf16_to_v3f32: +; GFX8-LABEL: test_bitcast_to_bfloat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1] +; GFX8-NEXT: flat_load_ushort v2, v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v3bf16_to_v3f32: +; GFX9-LABEL: test_bitcast_to_bfloat: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_load_ushort v2, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v3bf16_to_v3f32: +; GFX10-LABEL: test_bitcast_to_bfloat: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX10-NEXT: global_load_ushort v2, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v3bf16_to_v3f32: +; GFX11-LABEL: test_bitcast_to_bfloat: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off +; GFX11-NEXT: global_load_u16 v2, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <3 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <3 x bfloat> %load to <3 x float> - ret <3 x float> %fpext + %val = load i16, ptr addrspace(1) %in + %val_fp = bitcast i16 %val to bfloat + store bfloat %val_fp, ptr addrspace(1) %out + ret void } -define <4 x float> 
@global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v4bf16_to_v4f32: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +define bfloat @test_ret(bfloat %in) { +; GCN-LABEL: test_ret: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v4bf16_to_v4f32: -; GFX7: ; %bb.0: +; GFX7-LABEL: test_ret: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v4bf16_to_v4f32: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_ret: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v4bf16_to_v4f32: -; GFX9: ; %bb.0: +; GFX9-LABEL: test_ret: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v4bf16_to_v4f32: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_ret: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v4bf16_to_v4f32: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_ret: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <4 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <4 x bfloat> %load to <4 x float> - ret <4 x float> %fpext +entry: + ret bfloat %in } -define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v5bf16_to_v5f32: -; GCN: ; 
%bb.0: +define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) { +; GCN-LABEL: test_ret_v2bf16: +; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v5bf16_to_v5f32: -; GFX7: ; %bb.0: +; GFX7-LABEL: test_ret_v2bf16: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8 -; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v5bf16_to_v5f32: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_ret_v2bf16: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v4, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v5bf16_to_v5f32: -; GFX9: ; %bb.0: +; GFX9-LABEL: test_ret_v2bf16: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[5:6], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: global_load_short_d16_hi v4, v[0:1], off offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v5bf16_to_v5f32: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_ret_v2bf16: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off offset:8 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v5bf16_to_v5f32: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_ret_v2bf16: +; GFX11: ; 
%bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off offset:8 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <5 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <5 x bfloat> %load to <5 x float> - ret <5 x float> %fpext +entry: + ret <2 x bfloat> %in } -define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v6bf16_to_v6f32: -; GCN: ; %bb.0: +define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) { +; GCN-LABEL: test_ret_v3bf16: +; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v6bf16_to_v6f32: -; GFX7: ; %bb.0: +; GFX7-LABEL: test_ret_v3bf16: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v6bf16_to_v6f32: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_ret_v3bf16: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx3 v[3:5], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v6bf16_to_v6f32: -; GFX9: ; %bb.0: +; GFX9-LABEL: test_ret_v3bf16: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx3 v[6:8], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v6bf16_to_v6f32: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_ret_v3bf16: +; GFX10: ; %bb.0: ; %entry ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx3 v[4:6], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v6bf16_to_v6f32: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_ret_v3bf16: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <6 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <6 x bfloat> %load to <6 x float> - ret <6 x float> %fpext +entry: + ret <3 x bfloat> %in } -define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v8bf16_to_v8f32: -; GCN: ; %bb.0: +define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) { +; GCN-LABEL: test_ret_v4bf16: +; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v8bf16_to_v8f32: -; GFX7: ; %bb.0: +; GFX7-LABEL: test_ret_v4bf16: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v8bf16_to_v8f32: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_ret_v4bf16: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 
v6, 16, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v8bf16_to_v8f32: -; GFX9: ; %bb.0: +; GFX9-LABEL: test_ret_v4bf16: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v11 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v8bf16_to_v8f32: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_ret_v4bf16: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v8bf16_to_v8f32: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_ret_v4bf16: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v10 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <8 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <8 x bfloat> %load to <8 x float> - ret <8 x float> %fpext +entry: + ret <4 x bfloat> %in } -define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v16bf16_to_v16f32: -; GCN: ; %bb.0: +define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) { +; GCN-LABEL: test_ret_v8bf16: +; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GCN-NEXT: 
v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v16bf16_to_v16f32: -; GFX7: ; %bb.0: +; GFX7-LABEL: test_ret_v8bf16: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v16bf16_to_v16f32: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_ret_v8bf16: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v16bf16_to_v16f32: -; GFX9: ; %bb.0: +; GFX9-LABEL: test_ret_v8bf16: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GFX9-NEXT: 
v_lshlrev_b32_e32 v8, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v23 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v16bf16_to_v16f32: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_ret_v8bf16: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GFX10-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v23 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v16bf16_to_v16f32: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_ret_v8bf16: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[16:19], v[0:1], off -; GFX11-NEXT: global_load_b128 v[20:23], v[0:1], off offset:16 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v23 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <16 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <16 x bfloat> %load to <16 x float> - ret <16 x float> %fpext +entry: + ret <8 x bfloat> %in } -define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v32bf16_to_v32f32: -; GCN: ; %bb.0: +define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) { +; GCN-LABEL: test_ret_v16bf16: +; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: 
v_lshlrev_b32_e32 v0, 16, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 -; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28 -; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 -; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29 -; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 -; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v32bf16_to_v32f32: -; GFX7: ; %bb.0: +; GFX7-LABEL: test_ret_v16bf16: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 -; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 -; GFX7-NEXT: v_lshlrev_b32_e32 
v26, 16, v29 -; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 -; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 -; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v32bf16_to_v32f32: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_ret_v16bf16: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[2:3] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[2:3] -; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 -; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 -; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v21 -; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 -; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v23 -; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v28 -; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 -; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v29 -; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v30 -; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 -; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v31 -; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v32bf16_to_v32f32: -; GFX9: ; %bb.0: +; GFX9-LABEL: test_ret_v16bf16: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[16:19], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 -; GFX9-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 -; GFX9-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:48 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v16 -; GFX9-NEXT: 
v_lshlrev_b32_e32 v2, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v23 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v27 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v32 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v34 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v32 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v34 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v35 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v32bf16_to_v32f32: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_ret_v16bf16: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_dwordx4 v[32:35], v[0:1], off -; GFX10-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:16 -; GFX10-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:32 -; GFX10-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:48 -; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v55 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v32bf16_to_v32f32: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_ret_v16bf16: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: 
global_load_b128 v[32:35], v[0:1], off -; GFX11-NEXT: global_load_b128 v[36:39], v[0:1], off offset:16 -; GFX11-NEXT: global_load_b128 v[48:51], v[0:1], off offset:32 -; GFX11-NEXT: global_load_b128 v[52:55], v[0:1], off offset:48 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 -; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 -; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 -; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 -; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 -; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 -; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 -; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v32 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v34 -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v35 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v36 -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v38 -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v48 -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v49 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v50 -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v53 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v54 -; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v55 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <32 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <32 x bfloat> %load to <32 x float> - ret <32 x float> %fpext +entry: + ret <16 x bfloat> %in } -define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v2bf16_to_v2f64: -; GCN: ; %bb.0: +define void @test_call(bfloat %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call: +; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v2, s30, 0 +; GCN-NEXT: v_writelane_b32 v2, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GCN-NEXT: 
v_readlane_b32 s31, v2, 1 +; GCN-NEXT: v_readlane_b32 s30, v2, 0 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v2bf16_to_v2f64: -; GFX7: ; %bb.0: +; GFX7-LABEL: test_call: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v2, s30, 0 +; GFX7-NEXT: v_writelane_b32 v2, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s31, v2, 1 +; GFX7-NEXT: v_readlane_b32 s30, v2, 0 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v2bf16_to_v2f64: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_call: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v2, s30, 0 +; GFX8-NEXT: v_writelane_b32 v2, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s31, v2, 1 +; GFX8-NEXT: v_readlane_b32 s30, v2, 0 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX8-NEXT: s_setpc_b64 
s[30:31] ; -; GFX9-LABEL: global_extload_v2bf16_to_v2f64: -; GFX9: ; %bb.0: +; GFX9-LABEL: test_call: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[0:1], off +; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v2, s30, 0 +; GFX9-NEXT: v_writelane_b32 v2, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s31, v2, 1 +; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v2bf16_to_v2f64: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_call: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v2, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v2, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: buffer_store_short_d16_hi v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readlane_b32 s31, v2, 1 +; GFX10-NEXT: v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v2bf16_to_v2f64: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_call: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; 
GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v2, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v2, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_readlane_b32 s31, v2, 1 +; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <2 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <2 x bfloat> %load to <2 x double> - ret <2 x double> %fpext +entry: + %result = call bfloat @test_arg_store(bfloat %in) + store volatile bfloat %result, ptr addrspace(5) %out + ret void } -define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v3bf16_to_v3f64: -; GCN: ; %bb.0: +define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call_v2bf16: +; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v3, s30, 0 +; GCN-NEXT: v_writelane_b32 v3, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 2, v2 +; GCN-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v3, 1 +; GCN-NEXT: v_readlane_b32 s30, v3, 0 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v3bf16_to_v3f64: 
-; GFX7: ; %bb.0: +; GFX7-LABEL: test_call_v2bf16: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v3, s30, 0 +; GFX7-NEXT: v_writelane_b32 v3, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 2, v2 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s31, v3, 1 +; GFX7-NEXT: v_readlane_b32 s30, v3, 0 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v3bf16_to_v3f64: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_call_v2bf16: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1] +; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v2, s30, 0 +; GFX8-NEXT: v_writelane_b32 v2, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s31, v2, 1 +; GFX8-NEXT: v_readlane_b32 s30, v2, 0 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v3bf16_to_v3f64: 
-; GFX9: ; %bb.0: +; GFX9-LABEL: test_call_v2bf16: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v2, s30, 0 +; GFX9-NEXT: v_writelane_b32 v2, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s31, v2, 1 +; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v3bf16_to_v3f64: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_call_v2bf16: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v2, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v2, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readlane_b32 s31, v2, 1 +; GFX10-NEXT: v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v3bf16_to_v3f64: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_call_v2bf16: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: 
s_mov_b32 s33, s32 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v2, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v2, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: scratch_store_b32 v1, v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_readlane_b32 s31, v2, 1 +; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <3 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <3 x bfloat> %load to <3 x double> - ret <3 x double> %fpext +entry: + %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in) + store volatile <2 x bfloat> %result, ptr addrspace(5) %out + ret void } -define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v4bf16_to_v4f64: -; GCN: ; %bb.0: +define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call_v3bf16: +; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v4, s30, 0 +; GCN-NEXT: v_writelane_b32 v4, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v3 +; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GCN-NEXT: buffer_store_short v2, v5, s[0:3], 0 offen +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v4, 1 +; GCN-NEXT: v_readlane_b32 s30, v4, 0 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v4bf16_to_v4f64: -; GFX7: ; %bb.0: +; GFX7-LABEL: test_call_v3bf16: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v4, s30, 0 +; GFX7-NEXT: v_writelane_b32 v4, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v3 +; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s31, v4, 1 +; GFX7-NEXT: v_readlane_b32 s30, v4, 0 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v4bf16_to_v4f64: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_call_v3bf16: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v3, s30, 0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX8-NEXT: v_writelane_b32 v3, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v2 +; 
GFX8-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s31, v3, 1 +; GFX8-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v4bf16_to_v4f64: -; GFX9: ; %bb.0: +; GFX9-LABEL: test_call_v3bf16: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v4bf16_to_v4f64: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_call_v3bf16: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v3, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; 
GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_writelane_b32 v3, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readlane_b32 s31, v3, 1 +; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v4bf16_to_v4f64: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_call_v3bf16: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off +; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v3, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-NEXT: v_writelane_b32 v3, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b32 v2, v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_readlane_b32 s31, v3, 1 +; GFX11-NEXT: v_readlane_b32 s30, v3, 0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <4 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <4 x bfloat> %load to <4 x double> - ret <4 x double> %fpext +entry: + %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in) + store volatile <3 x bfloat> %result, ptr addrspace(5) %out + ret void } -define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) { -; GCN-LABEL: 
global_extload_v5bf16_to_v5f64: -; GCN: ; %bb.0: +define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call_v4bf16: +; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v5, s30, 0 +; GCN-NEXT: v_writelane_b32 v5, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 6, v4 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 4, v4 +; GCN-NEXT: v_add_i32_e32 v8, vcc, 2, v4 +; GCN-NEXT: buffer_store_short v3, v6, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GCN-NEXT: buffer_store_short v2, v7, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v5, 1 +; GCN-NEXT: v_readlane_b32 s30, v5, 0 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v5bf16_to_v5f64: -; GFX7: ; %bb.0: +; GFX7-LABEL: test_call_v4bf16: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 -; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, 
test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v5, s30, 0 +; GFX7-NEXT: v_writelane_b32 v5, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 6, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: buffer_store_short v3, v6, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s31, v5, 1 +; GFX7-NEXT: v_readlane_b32 s30, v5, 0 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v5bf16_to_v5f64: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_call_v4bf16: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v8, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v3, s30, 0 +; GFX8-NEXT: v_writelane_b32 v3, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: buffer_store_short v1, v6, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v2 +; GFX8-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2 +; GFX8-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s31, v3, 1 +; 
GFX8-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v5bf16_to_v5f64: -; GFX9: ; %bb.0: +; GFX9-LABEL: test_call_v4bf16: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:8 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v5bf16_to_v5f64: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_call_v4bf16: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off offset:8 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: 
s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v3, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v3, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: buffer_store_short_d16_hi v1, v2, s[0:3], 0 offen offset:6 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v0, v2, s[0:3], 0 offen offset:2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readlane_b32 s31, v3, 1 +; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v5bf16_to_v5f64: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_call_v4bf16: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off -; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off offset:8 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v3, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v3, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v3, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_add_nc_u32_e32 v4, 6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_readlane_b32 s31, v3, 1 +; GFX11-NEXT: v_readlane_b32 s30, v3, 0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v4, v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v2, v1, off offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v2, v0, off offset:2 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v2, v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v4 -; GFX11-NEXT: 
v_lshlrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <5 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <5 x bfloat> %load to <5 x double> - ret <5 x double> %fpext +entry: + %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in) + store volatile <4 x bfloat> %result, ptr addrspace(5) %out + ret void } -define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v6bf16_to_v6f64: -; GCN: ; %bb.0: +define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call_v8bf16: +; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v9, s30, 0 +; GCN-NEXT: v_writelane_b32 v9, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 14, v8 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 12, v8 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v8 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 8, v8 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 6, v8 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v8 +; GCN-NEXT: v_add_i32_e32 v16, vcc, 2, v8 +; GCN-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GCN-NEXT: buffer_store_short v6, v11, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v4, v13, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v14, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
buffer_store_short v1, v16, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v9, 1 +; GCN-NEXT: v_readlane_b32 s30, v9, 0 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v6bf16_to_v6f64: -; GFX7: ; %bb.0: +; GFX7-LABEL: test_call_v8bf16: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v9, s30, 0 +; GFX7-NEXT: v_writelane_b32 v9, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 14, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v8 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s31, v9, 1 +; GFX7-NEXT: v_readlane_b32 s30, v9, 0 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 -; GFX7-NEXT: 
v_cvt_f64_f32_e32 v[0:1], v3 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v6bf16_to_v6f64: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_call_v8bf16: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1] +; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v5, s30, 0 +; GFX8-NEXT: v_writelane_b32 v5, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 12, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: buffer_store_short v3, v10, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: global_extload_v6bf16_to_v6f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v5 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v9 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX8-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX8-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v4 +; GFX8-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX8-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v4 +; GFX8-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, 
vcc, 2, v4 +; GFX8-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s31, v5, 1 +; GFX8-NEXT: v_readlane_b32 s30, v5, 0 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_call_v8bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v5, s30, 0 +; GFX9-NEXT: v_writelane_b32 v5, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s31, v5, 1 +; GFX9-NEXT: v_readlane_b32 s30, v5, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v6bf16_to_v6f64: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_call_v8bf16: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx3 v[4:6], v[0:1], off +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v5, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v5, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: buffer_store_short_d16_hi v3, v4, s[0:3], 0 offen offset:14 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen offset:12 +; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v2, v4, s[0:3], 0 offen offset:10 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen offset:8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v1, v4, s[0:3], 0 offen offset:6 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v0, v4, s[0:3], 0 offen offset:2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_readlane_b32 s31, v5, 1 +; GFX10-NEXT: v_readlane_b32 s30, v5, 0 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v6bf16_to_v6f64: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_call_v8bf16: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off +; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v5, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v5, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_add_nc_u32_e32 v6, 14, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 12, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 10, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v9, 6, v4 +; GFX11-NEXT: v_readlane_b32 s31, v5, 1 +; GFX11-NEXT: scratch_store_d16_hi_b16 v6, v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v7, v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v8, v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v4, v2, off offset:8 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v9, v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v4, v1, off offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v4, v0, off offset:2 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v4, v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_readlane_b32 s30, v5, 0 +; 
GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <6 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <6 x bfloat> %load to <6 x double> - ret <6 x double> %fpext +entry: + %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in) + store volatile <8 x bfloat> %result, ptr addrspace(5) %out + ret void } -define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v8bf16_to_v8f64: -; GCN: ; %bb.0: +define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { +; GCN-LABEL: test_call_v16bf16: +; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 -; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-LABEL: global_extload_v8bf16_to_v8f64: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX7-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: global_extload_v8bf16_to_v8f64: -; GFX8: ; 
%bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-LABEL: global_extload_v8bf16_to_v8f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v5 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v8 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v9 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v12 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v13 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v16 -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: global_extload_v8bf16_to_v8f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v10 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v10 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_extload_v8bf16_to_v8f64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <8 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <8 x bfloat> %load to <8 x double> - ret <8 x double> %fpext -} - -define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v16bf16_to_v16f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_writelane_b32 v17, s30, 0 +; GCN-NEXT: v_writelane_b32 v17, s31, 1 +; GCN-NEXT: s_getpc_b64 s[4:5] +; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 30, v16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 28, v16 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 26, v16 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 24, v16 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 22, v16 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 20, v16 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 18, v16 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 16, v16 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 14, v16 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 12, v16 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 10, v16 +; GCN-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v15, vcc, 8, v16 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 6, v16 +; GCN-NEXT: buffer_store_short v14, v19, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 2, v16 +; GCN-NEXT: buffer_store_short v13, v20, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt 
vmcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 -; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 -; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 -; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 -; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 -; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 -; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GCN-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 -; GCN-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 -; GCN-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 -; GCN-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 -; GCN-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 -; GCN-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 +; GCN-NEXT: buffer_store_short v12, v21, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v11, v22, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v10, v23, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v9, v24, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v8, v25, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v7, v26, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v6, v27, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v5, v28, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v4, v15, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v3, v18, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v1, v19, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readlane_b32 s31, v17, 1 +; GCN-NEXT: v_readlane_b32 s30, v17, 0 +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v16bf16_to_v16f64: -; GFX7: ; %bb.0: +; GFX7-LABEL: test_call_v16bf16: +; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; 
GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 +; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: s_getpc_b64 s[4:5] +; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX7-NEXT: v_writelane_b32 v17, s30, 0 +; GFX7-NEXT: v_writelane_b32 v17, s31, 1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 30, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX7-NEXT: buffer_store_short v14, v15, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v14, vcc, 26, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: buffer_store_short v13, v14, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v13, vcc, 24, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX7-NEXT: buffer_store_short v12, v13, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v12, vcc, 22, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: buffer_store_short v11, v12, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v11, vcc, 20, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: buffer_store_short v10, v11, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v10, vcc, 18, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: buffer_store_short v9, v10, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 16, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: buffer_store_short v8, v9, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 14, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX7-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; 
GFX7-NEXT: v_add_i32_e32 v6, vcc, 10, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 8, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 6, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 2, v16 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s31, v17, 1 +; GFX7-NEXT: v_readlane_b32 s30, v17, 0 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v16bf16_to_v16f64: -; GFX8: ; %bb.0: +; GFX8-LABEL: test_call_v16bf16: +; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v4 -; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s33, s32 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0x400 +; GFX8-NEXT: s_getpc_b64 s[4:5] +; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX8-NEXT: v_writelane_b32 v9, s30, 0 +; GFX8-NEXT: v_writelane_b32 v9, s31, 1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 28, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX8-NEXT: buffer_store_short v7, v18, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 24, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GFX8-NEXT: buffer_store_short v6, v7, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 20, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v5 +; GFX8-NEXT: buffer_store_short v5, v6, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GFX8-NEXT: buffer_store_short v4, v5, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 12, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 
v13, 16, v3 +; GFX8-NEXT: buffer_store_short v3, v4, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX8-NEXT: buffer_store_short v2, v3, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX8-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 30, v8 +; GFX8-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 26, v8 +; GFX8-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 22, v8 +; GFX8-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 18, v8 +; GFX8-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 14, v8 +; GFX8-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 10, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX8-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v8 +; GFX8-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v8 +; GFX8-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_readlane_b32 s31, v9, 1 +; GFX8-NEXT: v_readlane_b32 s30, v9, 0 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: s_addk_i32 s32, 0xfc00 +; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v8 -; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 -; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v16bf16_to_v16f64: -; GFX9: ; %bb.0: +; GFX9-LABEL: test_call_v16bf16: +; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 
0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v5 +; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s33, s32 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v9, s30, 0 +; GFX9-NEXT: v_writelane_b32 v9, s31, 1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen offset:28 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v6, v8, s[0:3], 0 offen offset:24 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v5, v8, s[0:3], 0 offen offset:20 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v4, v8, s[0:3], 0 offen offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v3, v8, s[0:3], 0 offen offset:12 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v2, v8, s[0:3], 0 offen offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_readlane_b32 s31, v9, 1 +; GFX9-NEXT: v_readlane_b32 s30, v9, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s33, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v12 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v13 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v16 -; 
GFX9-NEXT: v_cvt_f64_f32_e32 v[26:27], v17 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[30:31], v20 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v24 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v25 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v28 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v29 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v32 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[24:25], v33 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[28:29], v34 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v16bf16_to_v16f64: -; GFX10: ; %bb.0: +; GFX10-LABEL: test_call_v16bf16: +; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off -; GFX10-NEXT: global_load_dwordx4 v[9:12], v[0:1], off offset:16 -; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 -; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 +; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s33, s32 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v9, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_writelane_b32 v9, s31, 1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: buffer_store_short_d16_hi v7, v8, s[0:3], 0 offen offset:30 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v7, v8, s[0:3], 0 offen offset:28 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v6, v8, s[0:3], 0 offen offset:26 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v6, v8, s[0:3], 0 offen offset:24 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v5, v8, s[0:3], 0 offen offset:22 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v5, v8, s[0:3], 0 offen offset:20 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v4, v8, s[0:3], 0 offen offset:18 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v4, v8, s[0:3], 0 offen offset:16 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v3, v8, s[0:3], 0 offen offset:14 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v3, v8, s[0:3], 0 offen offset:12 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v2, v8, s[0:3], 0 offen offset:10 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v2, v8, s[0:3], 0 offen offset:8 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v1, v8, s[0:3], 0 offen offset:6 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short_d16_hi v0, v8, s[0:3], 0 offen offset:2 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: 
v_readlane_b32 s31, v9, 1 +; GFX10-NEXT: v_readlane_b32 s30, v9, 0 +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s33, s6 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 -; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 -; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v10 -; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v12 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v6 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v15 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[18:19], v16 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[22:23], v17 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[16:17], v24 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[26:27], v20 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[30:31], v21 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[20:21], v25 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[24:25], v28 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v29 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v16bf16_to_v16f64: -; GFX11: ; %bb.0: +; GFX11-LABEL: test_call_v16bf16: +; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off -; GFX11-NEXT: global_load_b128 v[23:26], v[0:1], off offset:16 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 -; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9 -; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10 -; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10 +; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s33, s32 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX11-NEXT: v_writelane_b32 v9, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_writelane_b32 v9, s31, 1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: v_add_nc_u32_e32 v10, 30, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 28, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v12, 20, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v13, 18, v8 +; GFX11-NEXT: v_readlane_b32 s31, v9, 1 +; GFX11-NEXT: scratch_store_d16_hi_b16 v10, v7, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v11, v7, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 26, v8 +; GFX11-NEXT: 
v_add_nc_u32_e32 v10, 24, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v11, 22, v8 +; GFX11-NEXT: v_readlane_b32 s30, v9, 0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v7, v6, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v10, v6, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v11, v5, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v12, v5, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v13, v4, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 14, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 12, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 10, v8 +; GFX11-NEXT: scratch_store_b16 v8, v4, off offset:16 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 6, v8 +; GFX11-NEXT: scratch_store_d16_hi_b16 v5, v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v6, v3, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v7, v2, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v8, v2, off offset:8 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v4, v1, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v8, v1, off offset:4 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_d16_hi_b16 v8, v0, off offset:2 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_store_b16 v8, v0, off dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v23 -; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 -; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v24 -; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v24 -; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v25 -; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v26 -; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v26 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v27 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <16 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <16 x bfloat> %load to <16 x double> - ret <16 x double> %fpext +entry: + %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in) + store volatile <16 x bfloat> %result, ptr addrspace(5) %out + ret void } -define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { -; GCN-LABEL: global_extload_v32bf16_to_v32f64: +define bfloat @test_alloca_load_store_ret(bfloat %in) { +; GCN-LABEL: 
test_alloca_load_store_ret: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: test_alloca_load_store_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: test_alloca_load_store_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_alloca_load_store_ret: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_alloca_load_store_ret: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s32 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_alloca_load_store_ret: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s32 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, s32 glc dlc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %in.addr = alloca bfloat, align 2, addrspace(5) + store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2 + %loaded = load volatile bfloat, ptr addrspace(5) %in.addr, align 2 + ret bfloat %loaded +} + +define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { +; GCN-LABEL: test_overflow_stack: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s4, s6 -; GCN-NEXT: s_mov_b32 s5, s6 -; GCN-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 -; GCN-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2 -; GCN-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4 -; GCN-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:6 -; GCN-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:8 -; GCN-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10 -; GCN-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:12 -; GCN-NEXT: 
buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:14 -; GCN-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:16 -; GCN-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:18 -; GCN-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:20 -; GCN-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:22 -; GCN-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:24 -; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26 -; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28 -; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30 -; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:48 -; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50 -; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:52 -; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:54 -; GCN-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:56 -; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:58 -; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:60 -; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:62 -; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32 -; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34 -; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36 -; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38 -; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:40 -; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:42 -; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 -; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 -; GCN-NEXT: s_waitcnt vmcnt(8) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xfc, v0 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xf4, v0 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(2) ; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xec, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v32, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: buffer_store_dword v33, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0 +; GCN-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x64, v0 +; GCN-NEXT: 
buffer_store_dword v29, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xe4, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x60, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x5c, v0 +; GCN-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 +; GCN-NEXT: buffer_store_dword v27, v30, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xdc, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xd8, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x50, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x4c, v0 +; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd4, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xd0, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xcc, v0 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0 +; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc8, v0 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xc4, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xc0, v0 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 64, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 60, v0 +; GCN-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xbc, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xb8, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xb4, v0 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 52, v0 +; GCN-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb0, v0 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xac, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xa8, v0 -; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xa4, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa0, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x9c, v0 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 48, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 44, v0 +; GCN-NEXT: buffer_store_dword v22, v27, s[0:3], 0 offen 
; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x98, v0 -; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x94, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x90, v0 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 40, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 36, v0 +; GCN-NEXT: buffer_store_dword v21, v30, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x8c, v0 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x88, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x84, v0 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 28, v0 +; GCN-NEXT: buffer_store_dword v20, v26, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x80, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v0 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 20, v0 +; GCN-NEXT: buffer_store_dword v19, v29, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x70, v0 -; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 16, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 12, v0 +; GCN-NEXT: buffer_store_dword v18, v25, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0 -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x64, v0 -; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0 -; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x58, v0 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 -; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 -; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0 -; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v26, vcc, 64, v0 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 -; GCN-NEXT: 
s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; GCN-NEXT: v_add_i32_e32 v30, vcc, 48, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v31, vcc, 44, v0 -; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0 -; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 -; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0 -; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v24, vcc, 16, v0 -; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v28, vcc, 8, v0 -; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen -; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5 -; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v6 -; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 -; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8 -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11 -; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen -; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 -; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v9 -; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12 -; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 +; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v13, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v6, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v29, s[0:3], 0 offen ; GCN-NEXT: buffer_store_dword v4, v18, 
s[0:3], 0 offen -; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v36 -; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14 -; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15 -; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 -; GCN-NEXT: buffer_store_dword v6, v27, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v12, v31, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v15, v25, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v3, v24, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v9, v28, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen -; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: global_extload_v32bf16_to_v32f64: +; GFX7-LABEL: test_overflow_stack: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:62 -; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:60 -; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:58 -; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:56 -; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:54 -; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:52 -; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:50 -; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:48 -; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:32 -; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:34 -; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:36 -; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:38 -; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40 -; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42 -; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 -; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 -; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 -; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:2 -; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:4 -; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:6 -; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:8 -; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:10 -; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:12 -; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14 -; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:16 -; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:18 -; 
GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:20 -; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:22 -; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:24 -; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:26 -; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:28 -; GFX7-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 offset:30 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xfc, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf4, v0 -; GFX7-NEXT: v_add_i32_e32 v22, vcc, 0xd8, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xec, v0 -; GFX7-NEXT: s_waitcnt vmcnt(14) -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe4, v0 -; GFX7-NEXT: v_add_i32_e32 v24, vcc, 0xd0, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v25 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0 -; GFX7-NEXT: s_waitcnt vmcnt(8) -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; GFX7-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21 -; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v27 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xd4, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v20, v24, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v28 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xcc, v0 +; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x78, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x74, v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; GFX7-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; GFX7-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen +; GFX7-NEXT: 
v_add_i32_e32 v2, vcc, 0x68, v0 +; GFX7-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; GFX7-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; GFX7-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; GFX7-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; GFX7-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc8, v0 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc4, v0 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v34 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xbc, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v33 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v32 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb4, v0 -; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xac, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v31 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa8, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa4, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v30 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v29 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x9c, v0 -; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x98, v0 -; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x94, v0 -; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0 -; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x8c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; GFX7-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x88, v0 -; GFX7-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x84, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; GFX7-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x80, v0 -; GFX7-NEXT: 
buffer_store_dword v20, v15, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x7c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX7-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v14 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x74, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x70, v0 -; GFX7-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v12 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x6c, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX7-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v13, vcc, 0x68, v0 -; GFX7-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x64, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v8 -; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0 -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x5c, v0 -; GFX7-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX7-NEXT: buffer_store_dword v16, v8, s[0:3], 0 offen -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v5 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 -; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[5:6], v16 -; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x4c, v0 -; GFX7-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 -; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v11 -; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 -; GFX7-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 64, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v5, vcc, 60, v0 -; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 -; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; GFX7-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, 
vcc, 36, v0 -; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0 -; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; GFX7-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; GFX7-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; GFX7-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; GFX7-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; GFX7-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; GFX7-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; GFX7-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; GFX7-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; GFX7-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; GFX7-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; GFX7-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; GFX7-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; GFX7-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen ; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 12, v0 -; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0 -; GFX7-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen -; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v0 -; GFX7-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen -; GFX7-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; GFX7-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; GFX7-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0 +; GFX7-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: global_extload_v32bf16_to_v32f64: +; GFX8-LABEL: test_overflow_stack: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 2, v1 -; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 6, v1 -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 8, v1 -; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1 -; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1 -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 14, v1 -; 
GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 16, v1 -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 18, v1 -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 20, v1 -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1 -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1 -; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v27, vcc, 26, v1 -; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v29, vcc, 28, v1 -; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1 -; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v33, vcc, 32, v1 -; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v35, vcc, 34, v1 -; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc -; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX8-NEXT: v_add_u32_e32 v37, vcc, 36, v1 -; GFX8-NEXT: flat_load_ushort v43, v[1:2] -; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v48, vcc, 38, v1 -; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1 -; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v44, v[50:51] -; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1 -; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v45, v[50:51] -; GFX8-NEXT: v_add_u32_e32 v50, vcc, 40, v1 -; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1 -; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v46, v[52:53] -; GFX8-NEXT: v_add_u32_e32 v52, vcc, 42, v1 -; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1 -; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v47, v[54:55] -; GFX8-NEXT: v_add_u32_e32 v54, vcc, 44, v1 -; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1 -; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v56, v[39:40] -; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1 -; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v57, v[39:40] -; GFX8-NEXT: v_add_u32_e32 v39, vcc, 46, v1 -; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v41, vcc, 50, v1 -; GFX8-NEXT: v_addc_u32_e32 v42, vcc, 0, v2, vcc -; GFX8-NEXT: flat_load_ushort v41, v[41:42] -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GFX8-NEXT: 
flat_load_ushort v42, v[9:10] -; GFX8-NEXT: flat_load_ushort v9, v[35:36] -; GFX8-NEXT: flat_load_ushort v10, v[37:38] -; GFX8-NEXT: flat_load_ushort v35, v[48:49] -; GFX8-NEXT: flat_load_ushort v36, v[50:51] -; GFX8-NEXT: flat_load_ushort v37, v[52:53] -; GFX8-NEXT: flat_load_ushort v48, v[54:55] -; GFX8-NEXT: flat_load_ushort v39, v[39:40] -; GFX8-NEXT: flat_load_ushort v49, v[1:2] -; GFX8-NEXT: flat_load_ushort v50, v[3:4] -; GFX8-NEXT: flat_load_ushort v51, v[5:6] -; GFX8-NEXT: flat_load_ushort v52, v[7:8] -; GFX8-NEXT: flat_load_ushort v53, v[11:12] -; GFX8-NEXT: flat_load_ushort v38, v[13:14] -; GFX8-NEXT: flat_load_ushort v14, v[17:18] -; GFX8-NEXT: flat_load_ushort v11, v[21:22] -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v0 -; GFX8-NEXT: flat_load_ushort v15, v[15:16] -; GFX8-NEXT: flat_load_ushort v13, v[19:20] -; GFX8-NEXT: flat_load_ushort v8, v[23:24] -; GFX8-NEXT: flat_load_ushort v6, v[25:26] -; GFX8-NEXT: flat_load_ushort v5, v[27:28] -; GFX8-NEXT: flat_load_ushort v7, v[29:30] -; GFX8-NEXT: flat_load_ushort v12, v[31:32] -; GFX8-NEXT: flat_load_ushort v16, v[33:34] -; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xc4, v0 -; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xbc, v0 -; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xb4, v0 -; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xac, v0 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0xa4, v0 -; GFX8-NEXT: v_add_u32_e32 v27, vcc, 0x9c, v0 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v43 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfc, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v45 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf8, v0 -; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf4, v0 -; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v46 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xf0, v0 -; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xec, v0 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xe8, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX8-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe4, v0 -; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v56 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xdc, v0 -; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v57 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd8, v0 -; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd4, v0 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xd0, v0 -; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v41 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xcc, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v42 -; 
GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc8, v0 -; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v49 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v50 -; GFX8-NEXT: s_waitcnt vmcnt(14) -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v51 -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v52 -; GFX8-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xc0, v0 -; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v53 -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v38 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xb8, v0 -; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v48 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xb0, v0 -; GFX8-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v37 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xa8, v0 -; GFX8-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v23 -; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v36 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23 -; GFX8-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xa0, v0 -; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v25 -; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v35 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v25 -; GFX8-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v10 -; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0x98, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x94, v0 -; GFX8-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x90, v0 -; GFX8-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v14 -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x8c, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v15 -; GFX8-NEXT: buffer_store_dword v28, v14, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x88, v0 -; GFX8-NEXT: buffer_store_dword v27, v14, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v16 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v13 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x84, v0 -; GFX8-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x80, v0 -; GFX8-NEXT: buffer_store_dword v27, v13, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 
16, v12 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v9 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7c, v0 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; GFX8-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x78, v0 -; GFX8-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x74, v0 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX8-NEXT: buffer_store_dword v7, v13, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x70, v0 -; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 -; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x6c, v0 -; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x68, v0 -; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x64, v0 -; GFX8-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x60, v0 -; GFX8-NEXT: buffer_store_dword v12, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x5c, v0 -; GFX8-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x58, v0 -; GFX8-NEXT: buffer_store_dword v8, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0 -; GFX8-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x50, v0 -; GFX8-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 -; GFX8-NEXT: buffer_store_dword v15, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x48, v0 -; GFX8-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x44, v0 -; GFX8-NEXT: buffer_store_dword v11, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 64, v0 -; GFX8-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0 -; GFX8-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 56, v0 -; GFX8-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 52, v0 -; GFX8-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 48, v0 -; GFX8-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 44, v0 -; GFX8-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 40, v0 -; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 36, v0 -; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v0 -; GFX8-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 28, v0 -; GFX8-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 24, v0 -; GFX8-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v5, vcc, 20, v0 -; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GFX8-NEXT: v_add_u32_e32 v3, vcc, 12, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 -; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword 
v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x7c, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x78, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, 0x74, v0 ; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 +; GFX8-NEXT: buffer_store_dword v30, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 +; GFX8-NEXT: buffer_store_dword v29, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 +; GFX8-NEXT: buffer_store_dword v28, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 +; GFX8-NEXT: buffer_store_dword v27, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 +; GFX8-NEXT: buffer_store_dword v26, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v25, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v24, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 +; GFX8-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 +; GFX8-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 +; GFX8-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 +; GFX8-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 +; GFX8-NEXT: buffer_store_dword v19, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 64, v0 +; GFX8-NEXT: buffer_store_dword v18, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 60, v0 +; GFX8-NEXT: buffer_store_dword v17, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 56, v0 +; GFX8-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 52, v0 +; GFX8-NEXT: buffer_store_dword v15, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 48, v0 +; GFX8-NEXT: buffer_store_dword v14, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 44, v0 +; GFX8-NEXT: buffer_store_dword v13, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; GFX8-NEXT: buffer_store_dword v12, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 36, v0 +; GFX8-NEXT: buffer_store_dword v11, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; 
GFX8-NEXT: buffer_store_dword v10, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 28, v0 +; GFX8-NEXT: buffer_store_dword v9, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: buffer_store_dword v8, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 20, v0 +; GFX8-NEXT: buffer_store_dword v7, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; GFX8-NEXT: buffer_store_dword v6, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 12, v0 +; GFX8-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 8, v0 +; GFX8-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x80, v0 +; GFX8-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_overflow_stack: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: 
buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_overflow_stack: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 +; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 +; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 +; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 +; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 +; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 +; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 +; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 +; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 +; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 +; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:124 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:120 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:116 +; GFX10-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: test_overflow_stack: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_clause 0x4 +; GFX11-NEXT: scratch_store_b128 off, v[18:21], s0 offset:64 +; GFX11-NEXT: scratch_store_b128 off, v[10:13], s0 offset:32 +; GFX11-NEXT: scratch_store_b128 off, v[6:9], s0 offset:16 +; GFX11-NEXT: scratch_store_b128 off, v[2:5], s0 +; GFX11-NEXT: scratch_store_d16_hi_b16 off, v1, s0 offset:128 +; GFX11-NEXT: s_add_i32 
s1, s0, 0x70 +; GFX11-NEXT: s_add_i32 s2, s0, 0x60 +; GFX11-NEXT: s_add_i32 s3, s0, 0x50 +; GFX11-NEXT: s_add_i32 s0, s0, 48 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: scratch_store_b128 off, v[30:33], s1 +; GFX11-NEXT: scratch_store_b128 off, v[26:29], s2 +; GFX11-NEXT: scratch_store_b128 off, v[22:25], s3 +; GFX11-NEXT: scratch_store_b128 off, v[14:17], s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0 + %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1 + ret { <32 x i32>, bfloat } %ins.1 +} + +define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v2bf16_to_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v2bf16_to_v2f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v2bf16_to_v2f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v2bf16_to_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v2bf16_to_v2f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v2bf16_to_v2f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <2 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <2 x bfloat> %load to <2 x float> + ret <2 x float> %fpext +} + +define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v3bf16_to_v3f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v3bf16_to_v3f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v3bf16_to_v3f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v3bf16_to_v3f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v3bf16_to_v3f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v3bf16_to_v3f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <3 x bfloat> %load to <3 x float> + ret <3 x float> %fpext +} + +define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v4bf16_to_v4f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v4bf16_to_v4f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v4bf16_to_v4f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v4bf16_to_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v4bf16_to_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v4bf16_to_v4f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <4 x bfloat> %load to <4 x float> + ret <4 x float> %fpext +} + +define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v5bf16_to_v5f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v5bf16_to_v5f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_ushort v4, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX7-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v5bf16_to_v5f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v4, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 
16, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v5bf16_to_v5f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_short_d16_hi v4, v[0:1], off offset:8 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v5bf16_to_v5f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v5bf16_to_v5f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <5 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <5 x bfloat> %load to <5 x float> + ret <5 x float> %fpext +} + +define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v6bf16_to_v6f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v6bf16_to_v6f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx3 v[3:5], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v6bf16_to_v6f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx3 v[3:5], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v6bf16_to_v6f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx3 v[6:8], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v6bf16_to_v6f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx3 v[4:6], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v6bf16_to_v6f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <6 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <6 x bfloat> %load to <6 x float> + ret <6 x float> %fpext +} + +define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v8bf16_to_v8f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v8bf16_to_v8f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; 
GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v8bf16_to_v8f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v8bf16_to_v8f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v8bf16_to_v8f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v8bf16_to_v8f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v10 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <8 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <8 x bfloat> %load to <8 x float> + ret <8 x float> %fpext +} + +define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v16bf16_to_v16f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_lshlrev_b32_e32 v8, 16, v12 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v16bf16_to_v16f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v16bf16_to_v16f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v16bf16_to_v16f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[16:19], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v17 
+; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v16bf16_to_v16f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[16:19], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v16bf16_to_v16f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[16:19], v[0:1], off +; GFX11-NEXT: global_load_b128 v[20:23], v[0:1], off offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <16 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <16 x bfloat> %load to <16 x float> + ret <16 x float> %fpext +} + +define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v32bf16_to_v32f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; 
GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GCN-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GCN-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GCN-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GCN-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v32bf16_to_v32f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[4:7], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[12:15], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_dwordx4 v[20:23], v[0:1], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_dwordx4 v[28:31], v[0:1], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: s_waitcnt vmcnt(3) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX7-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX7-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GFX7-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX7-NEXT: 
v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v32bf16_to_v32f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GFX8-NEXT: flat_load_dwordx4 v[12:15], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 48, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[20:23], v[2:3] +; GFX8-NEXT: flat_load_dwordx4 v[28:31], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v12 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v20 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v21 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v28 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v29 +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v32bf16_to_v32f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[16:19], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[24:27], v[0:1], off offset:32 +; GFX9-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v17 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v19 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v21 +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v23 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: 
v_and_b32_e32 v17, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v32 +; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v33 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v34 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v35 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v32 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v34 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v32bf16_to_v32f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_dwordx4 v[32:35], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:16 +; GFX10-NEXT: global_load_dwordx4 v[48:51], v[0:1], off offset:32 +; GFX10-NEXT: global_load_dwordx4 v[52:55], v[0:1], off offset:48 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; GFX10-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v32bf16_to_v32f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: global_load_b128 v[32:35], v[0:1], off +; GFX11-NEXT: global_load_b128 v[36:39], v[0:1], off offset:16 +; GFX11-NEXT: global_load_b128 v[48:51], v[0:1], off offset:32 +; GFX11-NEXT: global_load_b128 v[52:55], v[0:1], off offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v34 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v35 +; 
GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v36 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v38 +; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v39 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v48 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v50 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v51 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v52 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v53 +; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v54 +; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v55 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v32 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v36 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v37 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v38 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v39 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v50 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v51 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v52 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v53 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v54 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v55 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <32 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <32 x bfloat> %load to <32 x float> + ret <32 x float> %fpext +} + +define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v2bf16_to_v2f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v2bf16_to_v2f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v2bf16_to_v2f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v2bf16_to_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v2, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], 
v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v2bf16_to_v2f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v2bf16_to_v2f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <2 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <2 x bfloat> %load to <2 x double> + ret <2 x double> %fpext +} + +define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v3bf16_to_v3f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v3bf16_to_v3f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx2 v[1:2], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v3bf16_to_v3f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v3bf16_to_v3f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v3bf16_to_v3f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v3bf16_to_v3f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <3 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <3 x bfloat> %load to <3 x double> + ret <3 x double> %fpext +} + +define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v4bf16_to_v4f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v4bf16_to_v4f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v4bf16_to_v4f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v4bf16_to_v4f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: 
v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v4bf16_to_v4f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v4bf16_to_v4f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <4 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <4 x bfloat> %load to <4 x double> + ret <4 x double> %fpext +} + +define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v5bf16_to_v5f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 +; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v5bf16_to_v5f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 offset:8 +; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; 
GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v5bf16_to_v5f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_ushort v8, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v5bf16_to_v5f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:8 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v5bf16_to_v5f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: global_load_short_d16_hi v4, v[0:1], off offset:8 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v5bf16_to_v5f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b64 v[2:3], v[0:1], off +; GFX11-NEXT: global_load_d16_hi_b16 v4, v[0:1], off offset:8 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], 
v6 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <5 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <5 x bfloat> %load to <5 x double> + ret <5 x double> %fpext +} + +define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v6bf16_to_v6f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 +; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v6bf16_to_v6f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx3 v[0:2], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v6bf16_to_v6f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx3 v[0:2], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v6bf16_to_v6f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v5 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 +; GFX9-NEXT: v_cvt_f64_f32_e32 
v[8:9], v9 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v6bf16_to_v6f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx3 v[4:6], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v6bf16_to_v6f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b96 v[4:6], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <6 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <6 x bfloat> %load to <6 x double> + ret <6 x double> %fpext +} + +define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v8bf16_to_v8f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GCN-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 +; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v8bf16_to_v8f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v12, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v8bf16_to_v8f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v8bf16_to_v8f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v4 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v5 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v8 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v9 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v12 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v13 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v8bf16_to_v8f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v10 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v10 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v8bf16_to_v8f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off +; 
GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <8 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <8 x bfloat> %load to <8 x double> + ret <8 x double> %fpext +} + +define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v16bf16_to_v16f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v24, 16, v8 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GCN-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 +; GCN-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 +; GCN-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GCN-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 +; GCN-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 +; GCN-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GCN-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GCN-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 +; GCN-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GCN-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GCN-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 +; GCN-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 +; GCN-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GCN-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v16bf16_to_v16f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v12, 16, v4 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v24, 16, v8 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v16bf16_to_v16f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx4 v[2:5], v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v5 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v8 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v9 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v9 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v11 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v12 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v14 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v15 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v16bf16_to_v16f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 
0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v9 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v12 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v13 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v16 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[26:27], v17 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[30:31], v20 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v24 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v25 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[12:13], v28 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v29 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v32 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[24:25], v33 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[28:29], v34 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v16bf16_to_v16f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[9:12], v[0:1], off offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v12 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[2:3], v6 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[8:9], v15 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[10:11], v13 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[12:13], v18 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[18:19], v16 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[22:23], v17 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[16:17], v24 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[26:27], v20 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[30:31], v21 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[20:21], v25 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[24:25], v28 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[28:29], v29 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v16bf16_to_v16f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[7:10], v[0:1], off +; GFX11-NEXT: 
global_load_b128 v[23:26], v[0:1], off offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v10 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v10 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v24 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v25 +; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v26 +; GFX11-NEXT: v_and_b32_e32 v30, 0xffff0000, v26 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v11 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[26:27], v27 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <16 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <16 x bfloat> %load to <16 x double> + ret <16 x double> %fpext +} + +define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { +; GCN-LABEL: global_extload_v32bf16_to_v32f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s4, s6 +; GCN-NEXT: s_mov_b32 s5, s6 +; GCN-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:2 +; GCN-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:4 +; GCN-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:6 +; GCN-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:10 +; GCN-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:12 +; GCN-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:14 +; GCN-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:16 +; GCN-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:18 +; GCN-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:20 +; GCN-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:22 +; GCN-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:24 +; GCN-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:26 +; GCN-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:28 +; GCN-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:30 +; GCN-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:48 +; GCN-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:50 +; GCN-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:52 +; GCN-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:54 +; GCN-NEXT: 
buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:56 +; GCN-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:58 +; GCN-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:60 +; GCN-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:62 +; GCN-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 offset:32 +; GCN-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:34 +; GCN-NEXT: buffer_load_ushort v21, v[1:2], s[4:7], 0 addr64 offset:36 +; GCN-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:38 +; GCN-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:40 +; GCN-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:42 +; GCN-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 +; GCN-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 +; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0xfc, v0 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v32, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v31 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xf4, v0 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0xec, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xe4, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 +; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xdc, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xd8, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xd4, v0 +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xd0, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xcc, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xc8, v0 +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xc4, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xc0, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xbc, v0 +; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xb8, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0xb4, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; 
GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0xb0, v0 +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0xac, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0xa8, v0 +; GCN-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0xa4, v0 +; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0xa0, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x9c, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v33 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x98, v0 +; GCN-NEXT: buffer_store_dword v1, v25, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x94, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x90, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v24 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x8c, v0 +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x88, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x84, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v23 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x80, v0 +; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v22 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v22, vcc, 0x74, v0 +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v30, vcc, 0x70, v0 +; GCN-NEXT: v_add_i32_e32 v31, vcc, 0x6c, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v21 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v21, vcc, 0x68, v0 +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 0x64, v0 +; GCN-NEXT: v_add_i32_e32 v32, vcc, 0x60, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v20 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v24, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v20, vcc, 0x5c, v0 +; GCN-NEXT: buffer_store_dword v1, v28, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v24, vcc, 0x58, v0 +; GCN-NEXT: v_add_i32_e32 v28, vcc, 0x54, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v19 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v29, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v19, vcc, 0x50, v0 +; GCN-NEXT: buffer_store_dword v1, v23, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v23, vcc, 0x4c, v0 +; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x48, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v18 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v26, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v18, vcc, 0x44, v0 +; GCN-NEXT: buffer_store_dword v1, v27, s[0:3], 0 offen +; GCN-NEXT: 
v_add_i32_e32 v26, vcc, 64, v0 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 60, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v17 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v17, vcc, 56, v0 +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v22, vcc, 52, v0 +; GCN-NEXT: v_add_i32_e32 v30, vcc, 48, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v31, vcc, 44, v0 +; GCN-NEXT: buffer_store_dword v1, v21, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; GCN-NEXT: v_add_i32_e32 v33, vcc, 36, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v15 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v25, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v25, vcc, 32, v0 +; GCN-NEXT: buffer_store_dword v1, v32, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v32, vcc, 28, v0 +; GCN-NEXT: v_add_i32_e32 v34, vcc, 24, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; GCN-NEXT: buffer_store_dword v1, v24, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v24, vcc, 16, v0 +; GCN-NEXT: v_add_i32_e32 v35, vcc, 12, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v13 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: buffer_store_dword v2, v28, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v28, vcc, 8, v0 +; GCN-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v19, vcc, 4, v0 +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v12 +; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GCN-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v14, 16, v5 +; GCN-NEXT: v_lshlrev_b32_e32 v15, 16, v6 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 16, v7 +; GCN-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v11 +; GCN-NEXT: buffer_store_dword v2, v23, s[0:3], 0 offen +; GCN-NEXT: v_cvt_f64_f32_e32 v[5:6], v10 +; GCN-NEXT: buffer_store_dword v1, v29, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f64_f32_e32 v[1:2], v9 +; GCN-NEXT: v_cvt_f64_f32_e32 v[7:8], v12 +; GCN-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GCN-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen +; GCN-NEXT: v_cvt_f64_f32_e32 v[11:12], v36 +; GCN-NEXT: buffer_store_dword v3, v26, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt expcnt(0) +; GCN-NEXT: v_cvt_f64_f32_e32 v[3:4], v14 +; GCN-NEXT: v_cvt_f64_f32_e32 v[13:14], v15 +; GCN-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 +; GCN-NEXT: buffer_store_dword v6, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v5, v17, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v2, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v12, v31, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v11, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v16, v33, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v15, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v14, v32, s[0:3], 0 
offen +; GCN-NEXT: buffer_store_dword v13, v34, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v4, v20, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v3, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v10, v35, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v9, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v8, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: global_extload_v32bf16_to_v32f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_ushort v20, v[1:2], s[4:7], 0 addr64 offset:62 +; GFX7-NEXT: buffer_load_ushort v22, v[1:2], s[4:7], 0 addr64 offset:60 +; GFX7-NEXT: buffer_load_ushort v23, v[1:2], s[4:7], 0 addr64 offset:58 +; GFX7-NEXT: buffer_load_ushort v24, v[1:2], s[4:7], 0 addr64 offset:56 +; GFX7-NEXT: buffer_load_ushort v25, v[1:2], s[4:7], 0 addr64 offset:54 +; GFX7-NEXT: buffer_load_ushort v26, v[1:2], s[4:7], 0 addr64 offset:52 +; GFX7-NEXT: buffer_load_ushort v27, v[1:2], s[4:7], 0 addr64 offset:50 +; GFX7-NEXT: buffer_load_ushort v28, v[1:2], s[4:7], 0 addr64 offset:48 +; GFX7-NEXT: buffer_load_ushort v15, v[1:2], s[4:7], 0 addr64 offset:32 +; GFX7-NEXT: buffer_load_ushort v18, v[1:2], s[4:7], 0 addr64 offset:34 +; GFX7-NEXT: buffer_load_ushort v29, v[1:2], s[4:7], 0 addr64 offset:36 +; GFX7-NEXT: buffer_load_ushort v30, v[1:2], s[4:7], 0 addr64 offset:38 +; GFX7-NEXT: buffer_load_ushort v31, v[1:2], s[4:7], 0 addr64 offset:40 +; GFX7-NEXT: buffer_load_ushort v32, v[1:2], s[4:7], 0 addr64 offset:42 +; GFX7-NEXT: buffer_load_ushort v33, v[1:2], s[4:7], 0 addr64 offset:44 +; GFX7-NEXT: buffer_load_ushort v34, v[1:2], s[4:7], 0 addr64 offset:46 +; GFX7-NEXT: buffer_load_ushort v19, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_load_ushort v17, v[1:2], s[4:7], 0 addr64 offset:2 +; GFX7-NEXT: buffer_load_ushort v14, v[1:2], s[4:7], 0 addr64 offset:4 +; GFX7-NEXT: buffer_load_ushort v12, v[1:2], s[4:7], 0 addr64 offset:6 +; GFX7-NEXT: buffer_load_ushort v10, v[1:2], s[4:7], 0 addr64 offset:8 +; GFX7-NEXT: buffer_load_ushort v9, v[1:2], s[4:7], 0 addr64 offset:10 +; GFX7-NEXT: buffer_load_ushort v7, v[1:2], s[4:7], 0 addr64 offset:12 +; GFX7-NEXT: buffer_load_ushort v4, v[1:2], s[4:7], 0 addr64 offset:14 +; GFX7-NEXT: buffer_load_ushort v5, v[1:2], s[4:7], 0 addr64 offset:16 +; GFX7-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 offset:18 +; GFX7-NEXT: buffer_load_ushort v6, v[1:2], s[4:7], 0 addr64 offset:20 +; GFX7-NEXT: buffer_load_ushort v8, v[1:2], s[4:7], 0 addr64 offset:22 +; GFX7-NEXT: buffer_load_ushort v11, v[1:2], s[4:7], 0 addr64 offset:24 +; GFX7-NEXT: buffer_load_ushort v13, v[1:2], s[4:7], 0 addr64 offset:26 +; GFX7-NEXT: buffer_load_ushort v16, v[1:2], s[4:7], 0 addr64 offset:28 +; GFX7-NEXT: buffer_load_ushort v1, v[1:2], s[4:7], 0 addr64 offset:30 +; GFX7-NEXT: s_waitcnt vmcnt(14) +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xfc, v0 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf8, v0 +; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf4, v0 +; GFX7-NEXT: v_add_i32_e32 v22, vcc, 
0xd8, v0 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xf0, v0 +; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xec, v0 +; GFX7-NEXT: s_waitcnt vmcnt(14) +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe8, v0 +; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe4, v0 +; GFX7-NEXT: v_add_i32_e32 v24, vcc, 0xd0, v0 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xe0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v25 +; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xdc, v0 +; GFX7-NEXT: s_waitcnt vmcnt(8) +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v26 +; GFX7-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v21 +; GFX7-NEXT: v_lshlrev_b32_e32 v22, 16, v27 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xd4, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v20, v24, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v28 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xcc, v0 +; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc8, v0 +; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc4, v0 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v34 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xc0, v0 +; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xbc, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v33 +; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb8, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v32 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb4, v0 +; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xb0, v0 +; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xac, v0 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v21, 16, v31 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa8, v0 +; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa4, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v30 +; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[22:23], v20 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0xa0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v20, 16, v29 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x9c, v0 +; GFX7-NEXT: buffer_store_dword v23, v2, s[0:3], 0 offen 
+; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x98, v0 +; GFX7-NEXT: buffer_store_dword v22, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x94, v0 +; GFX7-NEXT: buffer_store_dword v21, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x90, v0 +; GFX7-NEXT: buffer_store_dword v20, v2, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x8c, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; GFX7-NEXT: buffer_store_dword v21, v18, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v18, vcc, 0x88, v0 +; GFX7-NEXT: buffer_store_dword v20, v18, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[18:19], v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x84, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX7-NEXT: buffer_store_dword v21, v15, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x80, v0 +; GFX7-NEXT: buffer_store_dword v20, v15, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[20:21], v2 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x7c, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX7-NEXT: buffer_store_dword v2, v15, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[1:2], v14 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x74, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x70, v0 +; GFX7-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[14:15], v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x6c, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX7-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v13, vcc, 0x68, v0 +; GFX7-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[12:13], v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x64, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: buffer_store_dword v11, v16, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[16:17], v8 +; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x5c, v0 +; GFX7-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v17, v8, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX7-NEXT: buffer_store_dword v16, v8, s[0:3], 0 offen +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 +; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; GFX7-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[5:6], v16 +; GFX7-NEXT: v_add_i32_e32 v16, vcc, 0x4c, v0 +; GFX7-NEXT: buffer_store_dword v4, v16, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 +; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX7-NEXT: 
v_cvt_f64_f32_e32 v[3:4], v11 +; GFX7-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[7:8], v7 +; GFX7-NEXT: buffer_store_dword v6, v11, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v6, vcc, 64, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX7-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 60, v0 +; GFX7-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 +; GFX7-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; GFX7-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; GFX7-NEXT: buffer_store_dword v8, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; GFX7-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; GFX7-NEXT: buffer_store_dword v10, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; GFX7-NEXT: buffer_store_dword v9, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; GFX7-NEXT: buffer_store_dword v13, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; GFX7-NEXT: buffer_store_dword v12, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; GFX7-NEXT: buffer_store_dword v15, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; GFX7-NEXT: buffer_store_dword v14, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; GFX7-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; GFX7-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; GFX7-NEXT: buffer_store_dword v21, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; GFX7-NEXT: buffer_store_dword v20, v1, s[0:3], 0 offen +; GFX7-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GFX7-NEXT: buffer_store_dword v19, v1, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: global_extload_v32bf16_to_v32f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 2, v1 +; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v1 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 6, v1 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 8, v1 +; GFX8-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 10, v1 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 12, v1 +; GFX8-NEXT: v_addc_u32_e32 v14, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 14, v1 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v21, vcc, 16, v1 +; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, 18, v1 +; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v19, vcc, 20, v1 +; GFX8-NEXT: v_addc_u32_e32 v20, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v23, vcc, 22, v1 +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v25, vcc, 24, v1 +; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v27, vcc, 26, v1 +; GFX8-NEXT: v_addc_u32_e32 v28, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v29, vcc, 28, v1 +; GFX8-NEXT: v_addc_u32_e32 v30, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v31, vcc, 30, v1 +; GFX8-NEXT: v_addc_u32_e32 v32, vcc, 0, v2, vcc +; 
GFX8-NEXT: v_add_u32_e32 v33, vcc, 32, v1 +; GFX8-NEXT: v_addc_u32_e32 v34, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v35, vcc, 34, v1 +; GFX8-NEXT: v_addc_u32_e32 v36, vcc, 0, v2, vcc +; GFX8-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v57, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX8-NEXT: v_add_u32_e32 v37, vcc, 36, v1 +; GFX8-NEXT: flat_load_ushort v43, v[1:2] +; GFX8-NEXT: v_addc_u32_e32 v38, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v48, vcc, 38, v1 +; GFX8-NEXT: v_addc_u32_e32 v49, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v50, vcc, 62, v1 +; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v44, v[50:51] +; GFX8-NEXT: v_add_u32_e32 v50, vcc, 60, v1 +; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v45, v[50:51] +; GFX8-NEXT: v_add_u32_e32 v50, vcc, 40, v1 +; GFX8-NEXT: v_addc_u32_e32 v51, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v52, vcc, 58, v1 +; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v46, v[52:53] +; GFX8-NEXT: v_add_u32_e32 v52, vcc, 42, v1 +; GFX8-NEXT: v_addc_u32_e32 v53, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v54, vcc, 56, v1 +; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v47, v[54:55] +; GFX8-NEXT: v_add_u32_e32 v54, vcc, 44, v1 +; GFX8-NEXT: v_addc_u32_e32 v55, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v39, vcc, 54, v1 +; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v56, v[39:40] +; GFX8-NEXT: v_add_u32_e32 v39, vcc, 52, v1 +; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v57, v[39:40] +; GFX8-NEXT: v_add_u32_e32 v39, vcc, 46, v1 +; GFX8-NEXT: v_addc_u32_e32 v40, vcc, 0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v41, vcc, 50, v1 +; GFX8-NEXT: v_addc_u32_e32 v42, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v41, v[41:42] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 48, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX8-NEXT: flat_load_ushort v42, v[9:10] +; GFX8-NEXT: flat_load_ushort v9, v[35:36] +; GFX8-NEXT: flat_load_ushort v10, v[37:38] +; GFX8-NEXT: flat_load_ushort v35, v[48:49] +; GFX8-NEXT: flat_load_ushort v36, v[50:51] +; GFX8-NEXT: flat_load_ushort v37, v[52:53] +; GFX8-NEXT: flat_load_ushort v48, v[54:55] +; GFX8-NEXT: flat_load_ushort v39, v[39:40] +; GFX8-NEXT: flat_load_ushort v49, v[1:2] +; GFX8-NEXT: flat_load_ushort v50, v[3:4] +; GFX8-NEXT: flat_load_ushort v51, v[5:6] +; GFX8-NEXT: flat_load_ushort v52, v[7:8] +; GFX8-NEXT: flat_load_ushort v53, v[11:12] +; GFX8-NEXT: flat_load_ushort v38, v[13:14] +; GFX8-NEXT: flat_load_ushort v14, v[17:18] +; GFX8-NEXT: flat_load_ushort v11, v[21:22] +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v0 +; GFX8-NEXT: flat_load_ushort v15, v[15:16] +; GFX8-NEXT: 
flat_load_ushort v13, v[19:20] +; GFX8-NEXT: flat_load_ushort v8, v[23:24] +; GFX8-NEXT: flat_load_ushort v6, v[25:26] +; GFX8-NEXT: flat_load_ushort v5, v[27:28] +; GFX8-NEXT: flat_load_ushort v7, v[29:30] +; GFX8-NEXT: flat_load_ushort v12, v[31:32] +; GFX8-NEXT: flat_load_ushort v16, v[33:34] +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xc4, v0 +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xbc, v0 +; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xb4, v0 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xac, v0 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0xa4, v0 +; GFX8-NEXT: v_add_u32_e32 v27, vcc, 0x9c, v0 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xfc, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf8, v0 +; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xf4, v0 +; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xf0, v0 +; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xec, v0 +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xe8, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v47 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX8-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe4, v0 +; GFX8-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xe0, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; GFX8-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0xdc, v0 +; GFX8-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xd8, v0 +; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xd4, v0 +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xd0, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xcc, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; GFX8-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xc8, v0 +; GFX8-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[1:2], v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v50 +; GFX8-NEXT: s_waitcnt vmcnt(14) +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; GFX8-NEXT: buffer_store_dword v4, v18, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xc0, v0 +; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[3:4], v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v53 +; 
GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v38 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v18, vcc, 0xb8, v0 +; GFX8-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[17:18], v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v48 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xb0, v0 +; GFX8-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[19:20], v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xa8, v0 +; GFX8-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[21:22], v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v36 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v23 +; GFX8-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v24, vcc, 0xa0, v0 +; GFX8-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[23:24], v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v35 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v25 +; GFX8-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v10 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, 0x98, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v11 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x94, v0 +; GFX8-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v11, vcc, 0x90, v0 +; GFX8-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[25:26], v14 +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x8c, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v15 +; GFX8-NEXT: buffer_store_dword v28, v14, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0x88, v0 +; GFX8-NEXT: buffer_store_dword v27, v14, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[14:15], v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v13 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x84, v0 +; GFX8-NEXT: buffer_store_dword v28, v13, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x80, v0 +; GFX8-NEXT: buffer_store_dword v27, v13, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[27:28], v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v12 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v9 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7c, v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX8-NEXT: buffer_store_dword v13, v9, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x78, v0 +; GFX8-NEXT: buffer_store_dword v12, v9, s[0:3], 0 offen +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0x74, v0 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX8-NEXT: buffer_store_dword v7, v13, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x70, v0 +; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; GFX8-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX8-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX8-NEXT: 
v_add_u32_e32 v7, vcc, 0x6c, v0 +; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x68, v0 +; GFX8-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x64, v0 +; GFX8-NEXT: buffer_store_dword v13, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x60, v0 +; GFX8-NEXT: buffer_store_dword v12, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x5c, v0 +; GFX8-NEXT: buffer_store_dword v9, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x58, v0 +; GFX8-NEXT: buffer_store_dword v8, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x54, v0 +; GFX8-NEXT: buffer_store_dword v28, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x50, v0 +; GFX8-NEXT: buffer_store_dword v27, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x4c, v0 +; GFX8-NEXT: buffer_store_dword v15, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x48, v0 +; GFX8-NEXT: buffer_store_dword v14, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x44, v0 +; GFX8-NEXT: buffer_store_dword v11, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 64, v0 +; GFX8-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 60, v0 +; GFX8-NEXT: buffer_store_dword v26, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 56, v0 +; GFX8-NEXT: buffer_store_dword v25, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 52, v0 +; GFX8-NEXT: buffer_store_dword v24, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 48, v0 +; GFX8-NEXT: buffer_store_dword v23, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 44, v0 +; GFX8-NEXT: buffer_store_dword v22, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 40, v0 +; GFX8-NEXT: buffer_store_dword v21, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 36, v0 +; GFX8-NEXT: buffer_store_dword v20, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v0 +; GFX8-NEXT: buffer_store_dword v19, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 28, v0 +; GFX8-NEXT: buffer_store_dword v18, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 24, v0 +; GFX8-NEXT: buffer_store_dword v17, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 20, v0 +; GFX8-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 16, v0 +; GFX8-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 12, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; GFX8-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; GFX8-NEXT: buffer_load_dword v57, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX8-NEXT: s_waitcnt vmcnt(0) +; 
GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: global_extload_v32bf16_to_v32f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:62 +; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:60 +; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:58 +; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:56 +; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:54 +; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:52 +; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:50 +; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:48 +; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:46 +; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:44 +; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:42 +; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:40 +; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:38 +; GFX9-NEXT: global_load_ushort v19, v[1:2], off +; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:36 +; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:2 +; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:4 +; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:34 +; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:32 +; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:6 +; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:8 +; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:30 +; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16 +; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18 +; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20 +; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22 +; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24 +; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:26 +; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:28 +; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:10 +; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14 +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:252 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:248 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v26 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:244 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:240 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v27 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:236 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:232 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v23 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v24 +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v28 +; GFX9-NEXT: s_waitcnt vmcnt(30) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v29 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224 +; GFX9-NEXT: 
v_cvt_f64_f32_e32 v[21:22], v25 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v26 +; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:220 +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:216 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v27 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[27:28], v2 +; GFX9-NEXT: s_waitcnt vmcnt(28) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; GFX9-NEXT: s_waitcnt vmcnt(27) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v20 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v32 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:212 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:208 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v29 +; GFX9-NEXT: s_waitcnt vmcnt(26) +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[29:30], v30 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:204 +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:200 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v31 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[31:32], v32 +; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:196 +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:192 +; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:188 +; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:184 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:180 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176 +; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:172 +; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:168 +; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:164 +; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160 +; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:156 +; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:152 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v17 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144 +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v13 +; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v14 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v11 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:140 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:136 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v2 +; GFX9-NEXT: s_waitcnt vmcnt(40) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v15 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:132 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2 +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:128 +; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:112 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; 
GFX9-NEXT: s_waitcnt vmcnt(38) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:104 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 +; GFX9-NEXT: s_waitcnt vmcnt(39) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: s_waitcnt vmcnt(38) +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v18 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v21 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v22 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v23 +; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v12 +; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: global_extload_v32bf16_to_v32f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1f +; GFX10-NEXT: global_load_ushort v3, v[1:2], off +; GFX10-NEXT: global_load_ushort v4, v[1:2], off offset:2 +; GFX10-NEXT: global_load_ushort v5, v[1:2], off offset:4 +; GFX10-NEXT: global_load_ushort v6, v[1:2], off offset:6 +; GFX10-NEXT: global_load_ushort v7, v[1:2], off offset:8 +; GFX10-NEXT: global_load_ushort v8, v[1:2], off offset:10 +; GFX10-NEXT: global_load_ushort v9, v[1:2], off offset:12 +; GFX10-NEXT: global_load_ushort v10, v[1:2], off offset:14 +; 
GFX10-NEXT: global_load_ushort v11, v[1:2], off offset:16 +; GFX10-NEXT: global_load_ushort v12, v[1:2], off offset:18 +; GFX10-NEXT: global_load_ushort v13, v[1:2], off offset:20 +; GFX10-NEXT: global_load_ushort v14, v[1:2], off offset:22 +; GFX10-NEXT: global_load_ushort v15, v[1:2], off offset:24 +; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26 +; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28 +; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30 +; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:32 +; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:34 +; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:36 +; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:38 +; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:40 +; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:42 +; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:44 +; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:46 +; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:48 +; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:62 +; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:50 +; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:52 +; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:54 +; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:60 +; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:56 +; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:58 +; GFX10-NEXT: s_waitcnt vmcnt(31) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: s_waitcnt vmcnt(30) +; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; GFX10-NEXT: s_waitcnt vmcnt(29) +; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v5 +; GFX10-NEXT: s_waitcnt vmcnt(28) +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v6 +; GFX10-NEXT: s_waitcnt vmcnt(27) +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 +; GFX10-NEXT: s_waitcnt vmcnt(26) +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8 +; GFX10-NEXT: s_waitcnt vmcnt(25) +; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v9 +; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v10 +; GFX10-NEXT: s_waitcnt vmcnt(23) +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v11 +; GFX10-NEXT: s_waitcnt vmcnt(22) +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v12 +; GFX10-NEXT: s_waitcnt vmcnt(21) +; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v13 +; GFX10-NEXT: s_waitcnt vmcnt(20) +; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v14 +; GFX10-NEXT: s_waitcnt vmcnt(19) +; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v15 +; GFX10-NEXT: s_waitcnt vmcnt(18) +; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v16 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v37 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v38 +; GFX10-NEXT: s_waitcnt vmcnt(15) +; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v19 +; GFX10-NEXT: s_waitcnt vmcnt(14) +; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20 +; GFX10-NEXT: s_waitcnt vmcnt(13) +; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v21 +; GFX10-NEXT: s_waitcnt vmcnt(12) +; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 +; GFX10-NEXT: s_waitcnt vmcnt(11) +; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v23 +; GFX10-NEXT: s_waitcnt vmcnt(10) +; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v24 +; GFX10-NEXT: s_waitcnt vmcnt(9) +; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v25 +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v26 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v27 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_lshlrev_b32_e32 
v83, 16, v29 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v30 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v29 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v84 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v50 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v51 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v82 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v52 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v53 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[52:53], v80 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v35 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v36 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v48 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v49 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v54 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v55 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[54:55], v70 +; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v18 +; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 +; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v83 +; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v17 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:244 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v81 +; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:236 +; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:232 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v71 +; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228 +; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v65 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[64:65], v64 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:220 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:216 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v67 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[66:67], v66 +; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212 +; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v69 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v39 +; GFX10-NEXT: v_cvt_f64_f32_e32 v[68:69], v68 +; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204 +; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200 +; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:196 +; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:192 +; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188 +; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184 +; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:180 +; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:176 +; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:172 +; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:168 +; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:164 +; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:160 +; GFX10-NEXT: 
buffer_store_dword v22, v0, s[0:3], 0 offen offset:156 +; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:152 +; GFX10-NEXT: buffer_store_dword v65, v0, s[0:3], 0 offen offset:148 +; GFX10-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:144 +; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140 +; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:136 +; GFX10-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:132 +; GFX10-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:128 +; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124 +; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120 +; GFX10-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:116 +; GFX10-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:112 +; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108 +; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104 +; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:100 +; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:96 +; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:92 +; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:88 +; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:84 +; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:80 +; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:76 +; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:72 +; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:68 +; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:64 +; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:60 +; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:56 +; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:52 +; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:48 +; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:44 +; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:40 +; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36 +; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 +; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:28 +; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:24 +; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:20 +; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:16 +; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:12 +; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8 +; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: global_extload_v32bf16_to_v32f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1f +; GFX11-NEXT: global_load_u16 v3, v[1:2], off offset:12 +; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:8 +; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:4 +; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:2 +; GFX11-NEXT: global_load_u16 v7, v[1:2], off +; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:6 +; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:10 +; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:14 +; GFX11-NEXT: global_load_u16 v11, v[1:2], off offset:28 +; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:24 +; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:20 +; 
GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:18 +; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:16 +; GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:22 +; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:26 +; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:30 +; GFX11-NEXT: global_load_u16 v19, v[1:2], off offset:44 +; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:40 +; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:36 +; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:34 +; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:32 +; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:38 +; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:42 +; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:46 +; GFX11-NEXT: global_load_u16 v27, v[1:2], off offset:60 +; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:56 +; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:52 +; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:50 +; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:48 +; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54 +; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58 +; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 +; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 +; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 +; GFX11-NEXT: s_add_i32 s4, s0, 0xc0 +; GFX11-NEXT: s_add_i32 s5, s0, 0xb0 +; GFX11-NEXT: s_add_i32 s6, s0, 0xa0 +; GFX11-NEXT: s_add_i32 s7, s0, 0x90 +; GFX11-NEXT: s_add_i32 s8, s0, 0x70 +; GFX11-NEXT: s_add_i32 s9, s0, 0x60 +; GFX11-NEXT: s_add_i32 s10, s0, 0x50 +; GFX11-NEXT: s_add_i32 s11, s0, 48 +; GFX11-NEXT: s_waitcnt vmcnt(31) +; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3 +; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v4 +; GFX11-NEXT: s_waitcnt vmcnt(29) +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_waitcnt vmcnt(28) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX11-NEXT: s_waitcnt vmcnt(27) +; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7 +; GFX11-NEXT: s_waitcnt vmcnt(26) +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX11-NEXT: s_waitcnt vmcnt(25) +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: s_waitcnt vmcnt(24) +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: s_waitcnt vmcnt(23) +; GFX11-NEXT: v_lshlrev_b32_e32 v102, 16, v11 +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v12 +; GFX11-NEXT: s_waitcnt vmcnt(21) +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: s_waitcnt vmcnt(19) +; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v15 +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; GFX11-NEXT: s_waitcnt vmcnt(17) +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: s_waitcnt vmcnt(15) +; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v19 +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v20 +; GFX11-NEXT: s_waitcnt vmcnt(13) +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: s_waitcnt vmcnt(11) +; GFX11-NEXT: v_lshlrev_b32_e32 v103, 16, v23 +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: 
v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v27 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v28 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v31 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v32 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v68 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v65 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v64 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v33 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v1 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[80:81], v29 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v30 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[68:69], v53 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[66:67], v26 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v52 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[54:55], v25 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v49 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[50:51], v48 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[48:49], v21 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v34 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v22 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[33:34], v103 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[31:32], v18 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v102 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[27:28], v17 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v101 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[21:22], v13 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v14 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[17:18], v100 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[15:16], v10 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v39 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[11:12], v9 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v38 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v6 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2 +; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37 +; GFX11-NEXT: scratch_store_b128 off, v[96:99], s1 +; GFX11-NEXT: scratch_store_b128 off, v[84:87], s2 +; GFX11-NEXT: scratch_store_b128 off, v[80:83], s3 +; GFX11-NEXT: scratch_store_b128 off, v[68:71], s4 +; GFX11-NEXT: scratch_store_b128 off, v[64:67], s5 +; GFX11-NEXT: scratch_store_b128 off, v[52:55], s6 +; GFX11-NEXT: scratch_store_b128 off, v[48:51], s7 +; GFX11-NEXT: scratch_store_b128 off, v[33:36], s0 offset:128 +; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8 +; GFX11-NEXT: scratch_store_b128 off, v[25:28], s9 +; GFX11-NEXT: scratch_store_b128 off, v[21:24], s10 +; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64 +; GFX11-NEXT: scratch_store_b128 off, v[13:16], s11 +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32 +; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16 +; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load <32 x bfloat>, ptr addrspace(1) %ptr + %fpext = fpext <32 x bfloat> %load to <32 x double> + ret <32 x double> %fpext +} + +define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { +; GCN-LABEL: v_fadd_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, 
v0 +; GCN-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fadd_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fadd_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fadd_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fadd_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fadd_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fadd bfloat %a, %b + ret bfloat %op +} + +define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { +; GCN-LABEL: v_fadd_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v1, v1, v3 +; GCN-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fadd_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fadd_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fadd_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; 
GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fadd_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fadd_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fadd <2 x bfloat> %a, %b + ret <2 x bfloat> %op +} + +define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { +; GCN-LABEL: v_fadd_v3bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v2, v2, v5 +; GCN-NEXT: v_add_f32_e32 v1, v1, v4 +; GCN-NEXT: v_add_f32_e32 v0, v0, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fadd_v3bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fadd_v3bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fadd_v3bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fadd_v3bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fadd_v3bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fadd <3 x bfloat> %a, %b + ret <3 x bfloat> %op +} + +define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; GCN-LABEL: v_fadd_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v3, v3, v7 +; GCN-NEXT: v_add_f32_e32 v2, v2, v6 +; GCN-NEXT: v_add_f32_e32 v1, v1, v5 +; GCN-NEXT: v_add_f32_e32 v0, v0, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fadd_v4bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX7-NEXT: 
v_add_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fadd_v4bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fadd_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fadd_v4bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_add_f32_e32 v4, v7, v6 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fadd_v4bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fadd <4 x bfloat> %a, %b + ret <4 x bfloat> %op +} + +define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { +; GCN-LABEL: v_fadd_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v7, v7, v15 +; GCN-NEXT: v_add_f32_e32 v6, v6, v14 +; GCN-NEXT: v_add_f32_e32 v5, v5, v13 +; GCN-NEXT: v_add_f32_e32 v4, v4, v12 +; GCN-NEXT: v_add_f32_e32 v3, v3, v11 +; GCN-NEXT: v_add_f32_e32 v2, v2, v10 +; GCN-NEXT: v_add_f32_e32 v1, v1, v9 +; GCN-NEXT: v_add_f32_e32 v0, v0, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fadd_v8bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 
0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fadd_v8bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fadd_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v9, v10, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v10, v11, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; GFX9-NEXT: v_add_f32_e32 v11, v12, v11 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 +; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fadd_v8bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 
v13, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; GFX10-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_add_f32_e32 v9, v11, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_add_f32_e32 v10, v13, v12 +; GFX10-NEXT: v_add_f32_e32 v11, v15, v14 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 +; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fadd_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_f32_e32 v9, v10, v9 +; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX11-NEXT: v_dual_add_f32 v10, v12, v11 :: v_dual_add_f32 v11, v14, v13 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fadd <8 x bfloat> %a, %b + ret <8 x bfloat> %op +} + +define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { +; GCN-LABEL: v_fadd_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_add_f32_e32 v14, v14, v30 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: 
v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_add_f32_e32 v13, v13, v29 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_add_f32_e32 v12, v12, v28 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_add_f32_e32 v11, v11, v27 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_add_f32_e32 v10, v10, v26 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_add_f32_e32 v9, v9, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_add_f32_e32 v8, v8, v24 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_add_f32_e32 v7, v7, v23 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_add_f32_e32 v6, v6, v22 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_add_f32_e32 v5, v5, v21 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_add_f32_e32 v4, v4, v20 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v3, v3, v19 +; GCN-NEXT: v_add_f32_e32 v2, v2, v18 +; GCN-NEXT: v_add_f32_e32 v1, v1, v17 +; GCN-NEXT: v_add_f32_e32 v0, v0, v16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 +; GCN-NEXT: v_add_f32_e32 v15, v15, v16 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fadd_v16bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, 
v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX7-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 +; GFX7-NEXT: v_add_f32_e32 v15, v15, v16 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fadd_v16bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 
+; GFX8-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fadd_v16bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 +; GFX9-NEXT: v_add_f32_e32 v17, v18, v17 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5 +; GFX9-NEXT: v_add_f32_e32 v18, v19, v18 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 +; GFX9-NEXT: v_add_f32_e32 v19, v20, v19 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 +; GFX9-NEXT: v_add_f32_e32 v20, v21, v20 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v21, v22, v21 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 +; GFX9-NEXT: v_add_f32_e32 v22, v23, v22 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v23, v24, v23 +; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX9-NEXT: 
v_add_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fadd_v16bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; GFX10-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v2 +; GFX10-NEXT: v_add_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_add_f32_e32 v18, v20, v19 +; GFX10-NEXT: v_add_f32_e32 v19, v22, v21 +; GFX10-NEXT: v_add_f32_e32 v20, v24, v23 +; GFX10-NEXT: v_add_f32_e32 v21, v26, v25 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_add_f32_e32 v22, v23, v22 +; GFX10-NEXT: v_add_f32_e32 v23, v25, v24 +; GFX10-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 +; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 +; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 +; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 +; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 +; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 +; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 +; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fadd_v16bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; 
GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v26, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v4, v4, v12 :: v_dual_add_f32 v5, v5, v13 +; GFX11-NEXT: v_dual_add_f32 v17, v18, v17 :: v_dual_add_f32 v18, v20, v19 +; GFX11-NEXT: v_add_f32_e32 v19, v22, v21 +; GFX11-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX11-NEXT: v_add_f32_e32 v21, v26, v25 +; GFX11-NEXT: v_dual_add_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0 +; GFX11-NEXT: v_add_f32_e32 v20, v24, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_add_f32 v3, v3, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v1, v1, v9 :: v_dual_add_f32 v22, v23, v22 +; GFX11-NEXT: v_add_f32_e32 v23, v25, v24 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 +; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 +; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 +; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 +; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 +; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fadd <16 x bfloat> %a, %b + ret <16 x bfloat> %op +} + +define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { +; GCN-LABEL: v_fadd_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_add_f32_e32 v31, v32, v31 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GCN-NEXT: v_add_f32_e32 v30, v30, v32 
+; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v29, v29, v33 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_add_f32_e32 v28, v28, v32 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v27, v27, v33 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_add_f32_e32 v26, v26, v32 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v25, v25, v33 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 +; GCN-NEXT: v_add_f32_e32 v24, v24, v32 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v23, v23, v33 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; GCN-NEXT: v_add_f32_e32 v22, v22, v32 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v21, v21, v33 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 +; GCN-NEXT: v_add_f32_e32 v20, v20, v32 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v19, v19, v33 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; GCN-NEXT: v_add_f32_e32 v18, v18, v32 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v17, v17, v33 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_add_f32_e32 v16, v16, v32 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v15, v15, v33 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: 
s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_add_f32_e32 v14, v14, v32 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v13, v13, v33 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_add_f32_e32 v12, v12, v32 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v11, v11, v33 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_add_f32_e32 v10, v10, v32 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v9, v9, v33 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_add_f32_e32 v8, v8, v32 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v7, v7, v33 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_add_f32_e32 v6, v6, v32 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v5, v5, v33 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_add_f32_e32 v4, v4, v32 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v3, v3, v33 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_add_f32_e32 v2, v2, v32 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_add_f32_e32 v1, v1, v33 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_add_f32_e32 v0, v0, v32 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; 
GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fadd_v32bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v31, v32, v31 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v30, v30, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; 
GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v29, v29, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v28, v28, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v27, v27, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v26, v26, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v25, v25, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v24, v24, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v23, v23, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v22, v22, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v21, v21, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v20, v20, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v19, v19, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v18, v18, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v17, v17, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v16, v16, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v15, v15, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 
offset:60 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v14, v14, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v13, v13, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v12, v12, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v11, v11, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v10, v10, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v9, v9, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v8, v8, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v7, v7, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v6, v6, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v5, v5, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v4, v4, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v3, v3, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v2, v2, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v1, v1, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v32 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX8-LABEL: v_fadd_v32bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_add_f32_e32 v31, v32, v31 +; GFX8-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_add_f32_e32 v30, v32, v30 +; GFX8-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_add_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_add_f32_e32 v28, v32, v28 +; GFX8-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_add_f32_e32 v27, v32, v27 +; GFX8-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_add_f32_e32 v26, v32, v26 +; GFX8-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX8-NEXT: v_add_f32_e32 v25, v32, v25 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX8-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4 +; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX8-NEXT: v_add_f32_e32 v32, v32, v33 +; GFX8-NEXT: v_add_f32_e32 v15, v15, v24 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX8-NEXT: 
v_and_b32_e32 v33, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_add_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_add_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_add_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_add_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_add_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX8-NEXT: v_perm_b32 v15, v15, v32, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fadd_v32bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v58, 0xffff0000, v17 +; GFX9-NEXT: v_and_b32_e32 v59, 0xffff0000, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23 +; 
GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v21 +; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v44, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v45, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v46, 0xffff0000, v19 +; GFX9-NEXT: v_and_b32_e32 v47, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v56, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v57, 0xffff0000, v2 +; GFX9-NEXT: v_add_f32_e32 v38, v39, v38 +; GFX9-NEXT: v_add_f32_e32 v39, v49, v48 +; GFX9-NEXT: v_add_f32_e32 v48, v51, v50 +; GFX9-NEXT: v_add_f32_e32 v51, v41, v40 +; GFX9-NEXT: v_add_f32_e32 v40, v59, v58 +; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_add_f32_e32 v49, v53, v52 +; GFX9-NEXT: v_add_f32_e32 v50, v55, v54 +; GFX9-NEXT: v_add_f32_e32 v52, v43, v42 +; GFX9-NEXT: v_add_f32_e32 v53, v45, v44 +; GFX9-NEXT: v_add_f32_e32 v54, v47, v46 +; GFX9-NEXT: v_add_f32_e32 v55, v57, v56 +; GFX9-NEXT: v_perm_b32 v1, v1, v40, s4 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 +; GFX9-NEXT: v_add_f32_e32 v32, v33, v32 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 +; GFX9-NEXT: v_add_f32_e32 v34, v35, v34 +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v36, v37, v36 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v33, v35, v33 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 
v6, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX9-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_perm_b32 v0, v0, v33, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v55, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v54, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v53, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v52, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v51, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v50, s4 +; GFX9-NEXT: v_perm_b32 v8, v8, v49, s4 +; GFX9-NEXT: v_perm_b32 v9, v9, v48, s4 +; GFX9-NEXT: v_perm_b32 v10, v10, v39, s4 +; GFX9-NEXT: v_perm_b32 v11, v11, v38, s4 +; GFX9-NEXT: v_perm_b32 v12, v12, v36, s4 +; GFX9-NEXT: v_perm_b32 v13, v13, v34, s4 +; GFX9-NEXT: v_perm_b32 v14, v14, v32, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v31 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_add_f32_e32 v35, v37, v35 +; GFX9-NEXT: v_add_f32_e32 v15, v15, v31 +; GFX9-NEXT: v_perm_b32 v15, v15, v35, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fadd_v32bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX10-NEXT: v_add_f32_e32 v53, v54, v53 +; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 +; GFX10-NEXT: v_add_f32_e32 v55, v64, v55 +; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1 +; GFX10-NEXT: v_add_f32_e32 v65, v66, v65 +; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16 +; GFX10-NEXT: v_add_f32_e32 v67, v68, v67 +; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, 
v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX10-NEXT: v_add_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20 +; GFX10-NEXT: v_add_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; GFX10-NEXT: v_add_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19 +; GFX10-NEXT: v_add_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 +; GFX10-NEXT: v_add_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18 +; GFX10-NEXT: v_add_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX10-NEXT: v_add_f32_e32 v34, v36, v34 +; GFX10-NEXT: v_add_f32_e32 v36, v48, v38 +; GFX10-NEXT: v_add_f32_e32 v38, v52, v50 +; GFX10-NEXT: v_add_f32_e32 v48, v64, v54 +; GFX10-NEXT: v_add_f32_e32 v50, v68, v66 +; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706 +; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706 +; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706 +; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706 +; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706 +; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706 +; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706 +; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706 +; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706 +; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706 +; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706 +; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706 +; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706 +; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706 +; GFX10-NEXT: v_perm_b32 v14, v14, v33, 0x3020706 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 +; 
GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX10-NEXT: v_add_f32_e32 v16, v32, v16 +; GFX10-NEXT: v_add_f32_e32 v15, v15, v17 +; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fadd_v32bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_and_b32_e32 v82, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 +; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 +; GFX11-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 +; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22 +; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_dual_add_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23 +; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 +; GFX11-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19 +; GFX11-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX11-NEXT: v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_dual_add_f32 v33, v34, v33 :: v_dual_add_f32 v34, v36, v35 +; GFX11-NEXT: v_dual_add_f32 v35, v38, v37 :: v_dual_add_f32 v12, v12, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v5, v5, v21 +; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v13, v13, v29 +; GFX11-NEXT: v_dual_add_f32 v15, v15, v17 :: v_dual_add_f32 v14, v14, v30 +; GFX11-NEXT: v_add_f32_e32 v36, v48, v39 +; GFX11-NEXT: v_dual_add_f32 v48, v64, v55 :: v_dual_add_f32 v37, v50, v49 +; GFX11-NEXT: v_add_f32_e32 v50, v68, v67 +; GFX11-NEXT: v_dual_add_f32 v38, v52, v51 :: v_dual_add_f32 v51, v70, v69 +; GFX11-NEXT: v_dual_add_f32 v52, v80, v71 :: v_dual_add_f32 v39, v54, v53 +; GFX11-NEXT: v_dual_add_f32 v53, v82, v81 :: v_dual_add_f32 v54, v84, v83 +; GFX11-NEXT: v_add_f32_e32 v55, v86, v85 +; GFX11-NEXT: v_add_f32_e32 v49, v66, v65 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706 +; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706 +; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706 +; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706 +; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706 +; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706 +; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706 +; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706 +; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706 +; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706 +; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706 +; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706 +; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706 +; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706 +; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fadd <32 x bfloat> %a, %b + ret <32 x bfloat> %op +} + +define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { +; GCN-LABEL: v_fadd_bf16_fpimm_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fadd_bf16_fpimm_0: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fadd_bf16_fpimm_0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fadd_bf16_fpimm_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fadd_bf16_fpimm_0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fadd_bf16_fpimm_0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %add = fadd bfloat %arg0, 1.0 + ret bfloat %add +} + +define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { +; GCN-LABEL: v_fadd_bf16_fpimm_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fadd_bf16_fpimm_1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fadd_bf16_fpimm_1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fadd_bf16_fpimm_1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fadd_bf16_fpimm_1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fadd_bf16_fpimm_1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %add = fadd bfloat %arg0, 42.0 + ret bfloat %add +} + +define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { +; GCN-LABEL: v_fsub_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fsub_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fsub_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fsub_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fsub_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fsub_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fsub bfloat %a, %b + ret bfloat %op +} + +define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { +; GCN-LABEL: v_fsub_v2bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_sub_f32_e32 v1, v1, v3 +; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fsub_v2bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fsub_v2bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fsub_v2bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fsub_v2bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; 
GFX10-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fsub_v2bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fsub <2 x bfloat> %a, %b + ret <2 x bfloat> %op +} + +define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { +; GCN-LABEL: v_fsub_v3bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_sub_f32_e32 v2, v2, v5 +; GCN-NEXT: v_sub_f32_e32 v1, v1, v4 +; GCN-NEXT: v_sub_f32_e32 v0, v0, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fsub_v3bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_sub_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fsub_v3bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v32bf16_to_v32f64: +; GFX9-LABEL: v_fsub_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:62 -; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:60 -; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:58 -; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:56 -; GFX9-NEXT: global_load_ushort v26, v[1:2], off offset:54 -; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:52 -; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:50 -; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:48 -; GFX9-NEXT: global_load_ushort v30, v[1:2], off 
offset:46 -; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:44 -; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:42 -; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:40 -; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:38 -; GFX9-NEXT: global_load_ushort v19, v[1:2], off -; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:36 -; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:4 -; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:34 -; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:32 -; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:6 -; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:30 -; GFX9-NEXT: global_load_ushort v3, v[1:2], off offset:16 -; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18 -; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20 -; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22 -; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24 -; GFX9-NEXT: global_load_ushort v10, v[1:2], off offset:26 -; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:28 -; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:10 -; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v25 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:252 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:248 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:244 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:240 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v27 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:236 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:232 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v23 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v24 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v28 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v29 -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v25 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v26 -; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:220 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:216 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[23:24], v27 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[27:28], v2 -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; GFX9-NEXT: s_waitcnt vmcnt(27) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v32 -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v34 -; 
GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:212 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:208 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v29 -; GFX9-NEXT: s_waitcnt vmcnt(26) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[29:30], v30 -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:204 -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:200 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[25:26], v31 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[31:32], v32 -; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:196 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:192 -; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:188 -; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:184 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:180 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:176 -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:172 -; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:168 -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:164 -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:160 -; GFX9-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:156 -; GFX9-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:152 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v17 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:148 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:144 -; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v13 -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v14 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v11 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:140 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:136 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[16:17], v2 -; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:132 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[14:15], v2 -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: 
v_cvt_f64_f32_e32 v[6:7], v6 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v18 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[18:19], v21 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v22 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[22:23], v23 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v12 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: global_extload_v32bf16_to_v32f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x1f -; GFX10-NEXT: global_load_ushort v3, v[1:2], off -; GFX10-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; GFX10-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; GFX10-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; GFX10-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; GFX10-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; GFX10-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; GFX10-NEXT: global_load_ushort v10, v[1:2], off offset:14 -; GFX10-NEXT: global_load_ushort v11, v[1:2], off offset:16 -; GFX10-NEXT: global_load_ushort v12, v[1:2], off offset:18 -; GFX10-NEXT: global_load_ushort v13, v[1:2], off offset:20 -; 
GFX10-NEXT: global_load_ushort v14, v[1:2], off offset:22 -; GFX10-NEXT: global_load_ushort v15, v[1:2], off offset:24 -; GFX10-NEXT: global_load_ushort v16, v[1:2], off offset:26 -; GFX10-NEXT: global_load_ushort v17, v[1:2], off offset:28 -; GFX10-NEXT: global_load_ushort v18, v[1:2], off offset:30 -; GFX10-NEXT: global_load_ushort v19, v[1:2], off offset:32 -; GFX10-NEXT: global_load_ushort v20, v[1:2], off offset:34 -; GFX10-NEXT: global_load_ushort v21, v[1:2], off offset:36 -; GFX10-NEXT: global_load_ushort v22, v[1:2], off offset:38 -; GFX10-NEXT: global_load_ushort v23, v[1:2], off offset:40 -; GFX10-NEXT: global_load_ushort v24, v[1:2], off offset:42 -; GFX10-NEXT: global_load_ushort v25, v[1:2], off offset:44 -; GFX10-NEXT: global_load_ushort v26, v[1:2], off offset:46 -; GFX10-NEXT: global_load_ushort v27, v[1:2], off offset:48 -; GFX10-NEXT: global_load_ushort v28, v[1:2], off offset:62 -; GFX10-NEXT: global_load_ushort v29, v[1:2], off offset:50 -; GFX10-NEXT: global_load_ushort v30, v[1:2], off offset:52 -; GFX10-NEXT: global_load_ushort v31, v[1:2], off offset:54 -; GFX10-NEXT: global_load_ushort v32, v[1:2], off offset:60 -; GFX10-NEXT: global_load_ushort v33, v[1:2], off offset:56 -; GFX10-NEXT: global_load_ushort v34, v[1:2], off offset:58 -; GFX10-NEXT: s_waitcnt vmcnt(31) -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: s_waitcnt vmcnt(30) -; GFX10-NEXT: v_lshlrev_b32_e32 v35, 16, v4 -; GFX10-NEXT: s_waitcnt vmcnt(29) -; GFX10-NEXT: v_lshlrev_b32_e32 v36, 16, v5 -; GFX10-NEXT: s_waitcnt vmcnt(28) -; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v6 -; GFX10-NEXT: s_waitcnt vmcnt(27) -; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v7 -; GFX10-NEXT: s_waitcnt vmcnt(26) -; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v8 -; GFX10-NEXT: s_waitcnt vmcnt(25) -; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v10 -; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v11 -; GFX10-NEXT: s_waitcnt vmcnt(22) -; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v12 -; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_lshlrev_b32_e32 v52, 16, v13 -; GFX10-NEXT: s_waitcnt vmcnt(20) -; GFX10-NEXT: v_lshlrev_b32_e32 v53, 16, v14 -; GFX10-NEXT: s_waitcnt vmcnt(19) -; GFX10-NEXT: v_lshlrev_b32_e32 v54, 16, v15 -; GFX10-NEXT: s_waitcnt vmcnt(18) -; GFX10-NEXT: v_lshlrev_b32_e32 v55, 16, v16 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[11:12], v37 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[15:16], v38 -; GFX10-NEXT: s_waitcnt vmcnt(15) -; GFX10-NEXT: v_lshlrev_b32_e32 v66, 16, v19 -; GFX10-NEXT: s_waitcnt vmcnt(14) -; GFX10-NEXT: v_lshlrev_b32_e32 v67, 16, v20 -; GFX10-NEXT: s_waitcnt vmcnt(13) -; GFX10-NEXT: v_lshlrev_b32_e32 v64, 16, v21 -; GFX10-NEXT: s_waitcnt vmcnt(12) -; GFX10-NEXT: v_lshlrev_b32_e32 v65, 16, v22 -; GFX10-NEXT: s_waitcnt vmcnt(11) -; GFX10-NEXT: v_lshlrev_b32_e32 v70, 16, v23 -; GFX10-NEXT: s_waitcnt vmcnt(10) -; GFX10-NEXT: v_lshlrev_b32_e32 v71, 16, v24 -; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_lshlrev_b32_e32 v80, 16, v25 -; GFX10-NEXT: s_waitcnt vmcnt(8) -; GFX10-NEXT: v_lshlrev_b32_e32 v81, 16, v26 -; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: v_lshlrev_b32_e32 v82, 16, v27 -; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v28 -; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_lshlrev_b32_e32 v83, 16, v29 -; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_lshlrev_b32_e32 v84, 16, v30 -; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; 
GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v33 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v29 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v84 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v13 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v21 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[25:26], v50 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[27:28], v51 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[50:51], v82 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[31:32], v52 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[33:34], v53 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[52:53], v80 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[7:8], v35 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[9:10], v36 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[19:20], v48 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[23:24], v49 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[35:36], v54 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[48:49], v55 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[54:55], v70 -; GFX10-NEXT: v_lshlrev_b32_e32 v69, 16, v18 -; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:252 -; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[1:2], v83 -; GFX10-NEXT: v_lshlrev_b32_e32 v68, 16, v17 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[3:4], v3 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:244 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:240 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[5:6], v81 -; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:236 -; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:232 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[13:14], v71 -; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:228 -; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:224 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[21:22], v65 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[64:65], v64 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:220 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:216 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[29:30], v67 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[66:67], v66 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:212 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:208 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[37:38], v69 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[17:18], v39 -; GFX10-NEXT: v_cvt_f64_f32_e32 v[68:69], v68 -; GFX10-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:204 -; GFX10-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200 -; GFX10-NEXT: buffer_store_dword v51, v0, s[0:3], 0 offen offset:196 -; GFX10-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen offset:192 -; GFX10-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:188 -; GFX10-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:184 -; GFX10-NEXT: buffer_store_dword v53, v0, s[0:3], 0 offen offset:180 -; GFX10-NEXT: buffer_store_dword v52, v0, s[0:3], 0 offen offset:176 -; GFX10-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:172 -; GFX10-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:168 -; GFX10-NEXT: buffer_store_dword v55, v0, s[0:3], 0 offen offset:164 -; GFX10-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen offset:160 -; GFX10-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:156 -; GFX10-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:152 -; GFX10-NEXT: buffer_store_dword v65, v0, s[0:3], 0 
offen offset:148 -; GFX10-NEXT: buffer_store_dword v64, v0, s[0:3], 0 offen offset:144 -; GFX10-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:140 -; GFX10-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:136 -; GFX10-NEXT: buffer_store_dword v67, v0, s[0:3], 0 offen offset:132 -; GFX10-NEXT: buffer_store_dword v66, v0, s[0:3], 0 offen offset:128 -; GFX10-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:124 -; GFX10-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen offset:120 -; GFX10-NEXT: buffer_store_dword v69, v0, s[0:3], 0 offen offset:116 -; GFX10-NEXT: buffer_store_dword v68, v0, s[0:3], 0 offen offset:112 -; GFX10-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:108 -; GFX10-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:104 -; GFX10-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:100 -; GFX10-NEXT: buffer_store_dword v35, v0, s[0:3], 0 offen offset:96 -; GFX10-NEXT: buffer_store_dword v34, v0, s[0:3], 0 offen offset:92 -; GFX10-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:88 -; GFX10-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen offset:84 -; GFX10-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen offset:80 -; GFX10-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:76 -; GFX10-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:72 -; GFX10-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:68 -; GFX10-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:64 -; GFX10-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:60 -; GFX10-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:56 -; GFX10-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:52 -; GFX10-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:48 -; GFX10-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:44 -; GFX10-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:40 -; GFX10-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:36 -; GFX10-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:32 -; GFX10-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:28 -; GFX10-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:24 -; GFX10-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:20 -; GFX10-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:16 -; GFX10-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:12 -; GFX10-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:8 -; GFX10-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 -; GFX10-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GFX10-LABEL: v_fsub_v3bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fsub_v3bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_sub_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fsub <3 x bfloat> %a, %b + ret <3 x bfloat> %op +} + +define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; GCN-LABEL: v_fsub_v4bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_sub_f32_e32 v3, v3, v7 +; GCN-NEXT: v_sub_f32_e32 v2, v2, v6 +; GCN-NEXT: v_sub_f32_e32 v1, v1, v5 +; GCN-NEXT: v_sub_f32_e32 v0, v0, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fsub_v4bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_sub_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_sub_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fsub_v4bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fsub_v4bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fsub_v4bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_sub_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_sub_f32_e32 v4, v7, v6 +; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: global_extload_v32bf16_to_v32f64: +; GFX11-LABEL: v_fsub_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: global_load_u16 v3, v[1:2], off offset:12 -; GFX11-NEXT: global_load_u16 v4, v[1:2], off offset:8 -; GFX11-NEXT: global_load_u16 v5, v[1:2], off offset:4 -; GFX11-NEXT: global_load_u16 v6, v[1:2], off offset:2 -; GFX11-NEXT: global_load_u16 v7, v[1:2], off -; GFX11-NEXT: global_load_u16 v8, v[1:2], off offset:6 -; GFX11-NEXT: global_load_u16 v9, v[1:2], off offset:10 -; GFX11-NEXT: global_load_u16 v10, v[1:2], off offset:14 -; GFX11-NEXT: global_load_u16 v11, v[1:2], off offset:28 -; GFX11-NEXT: global_load_u16 v12, v[1:2], off offset:24 -; GFX11-NEXT: global_load_u16 v13, v[1:2], off offset:20 -; GFX11-NEXT: global_load_u16 v14, v[1:2], off offset:18 -; GFX11-NEXT: global_load_u16 v15, v[1:2], off offset:16 -; GFX11-NEXT: global_load_u16 v16, v[1:2], off offset:22 -; GFX11-NEXT: global_load_u16 v17, v[1:2], off offset:26 -; GFX11-NEXT: global_load_u16 v18, v[1:2], off offset:30 -; GFX11-NEXT: global_load_u16 v19, v[1:2], off offset:44 -; GFX11-NEXT: global_load_u16 v20, v[1:2], off offset:40 -; GFX11-NEXT: global_load_u16 v21, v[1:2], off offset:36 -; GFX11-NEXT: global_load_u16 v22, v[1:2], off offset:34 -; GFX11-NEXT: global_load_u16 v23, v[1:2], off offset:32 -; GFX11-NEXT: global_load_u16 v24, v[1:2], off offset:38 -; GFX11-NEXT: global_load_u16 v25, v[1:2], off offset:42 -; GFX11-NEXT: global_load_u16 v26, v[1:2], off offset:46 -; GFX11-NEXT: global_load_u16 v27, v[1:2], off offset:60 -; GFX11-NEXT: global_load_u16 v28, v[1:2], off offset:56 -; GFX11-NEXT: global_load_u16 v29, v[1:2], off offset:52 -; GFX11-NEXT: global_load_u16 v30, v[1:2], off offset:50 -; GFX11-NEXT: global_load_u16 v31, v[1:2], off offset:48 -; GFX11-NEXT: global_load_u16 v32, v[1:2], off offset:54 -; GFX11-NEXT: global_load_u16 v33, v[1:2], off offset:58 -; GFX11-NEXT: global_load_u16 v1, v[1:2], off offset:62 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: 
v_lshlrev_b32_e32 v4, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_add_i32 s1, s0, 0xf0 -; GFX11-NEXT: s_add_i32 s2, s0, 0xe0 -; GFX11-NEXT: s_add_i32 s3, s0, 0xd0 -; GFX11-NEXT: s_add_i32 s4, s0, 0xc0 -; GFX11-NEXT: s_add_i32 s5, s0, 0xb0 -; GFX11-NEXT: s_add_i32 s6, s0, 0xa0 -; GFX11-NEXT: s_add_i32 s7, s0, 0x90 -; GFX11-NEXT: s_add_i32 s8, s0, 0x70 -; GFX11-NEXT: s_add_i32 s9, s0, 0x60 -; GFX11-NEXT: s_add_i32 s10, s0, 0x50 -; GFX11-NEXT: s_add_i32 s11, s0, 48 -; GFX11-NEXT: s_waitcnt vmcnt(31) -; GFX11-NEXT: v_lshlrev_b32_e32 v39, 16, v3 -; GFX11-NEXT: s_waitcnt vmcnt(30) -; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v4 -; GFX11-NEXT: s_waitcnt vmcnt(29) -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: s_waitcnt vmcnt(28) -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX11-NEXT: s_waitcnt vmcnt(27) -; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v7 -; GFX11-NEXT: s_waitcnt vmcnt(26) -; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-NEXT: s_waitcnt vmcnt(25) -; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: s_waitcnt vmcnt(24) -; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: v_lshlrev_b32_e32 v102, 16, v11 -; GFX11-NEXT: s_waitcnt vmcnt(22) -; GFX11-NEXT: v_lshlrev_b32_e32 v101, 16, v12 -; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-NEXT: s_waitcnt vmcnt(20) -; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: s_waitcnt vmcnt(19) -; GFX11-NEXT: v_lshlrev_b32_e32 v100, 16, v15 -; GFX11-NEXT: s_waitcnt vmcnt(18) -; GFX11-NEXT: v_lshlrev_b32_e32 v34, 16, v16 -; GFX11-NEXT: s_waitcnt vmcnt(17) -; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-NEXT: s_waitcnt vmcnt(16) -; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-NEXT: s_waitcnt vmcnt(15) -; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v19 -; GFX11-NEXT: s_waitcnt vmcnt(14) -; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v20 -; GFX11-NEXT: s_waitcnt vmcnt(13) -; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-NEXT: s_waitcnt vmcnt(12) -; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-NEXT: s_waitcnt vmcnt(11) -; GFX11-NEXT: v_lshlrev_b32_e32 v103, 16, v23 -; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v24 -; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v27 -; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v28 -; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v31 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; 
GFX11-NEXT: v_lshlrev_b32_e32 v64, 16, v32 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[96:97], v68 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[84:85], v65 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[82:83], v64 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[86:87], v33 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[98:99], v1 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[80:81], v29 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[70:71], v30 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[68:69], v53 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[66:67], v26 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[64:65], v52 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[54:55], v25 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[52:53], v49 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[50:51], v48 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[48:49], v21 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[23:24], v34 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[35:36], v22 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[33:34], v103 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[31:32], v18 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[29:30], v102 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[27:28], v17 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[25:26], v101 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[21:22], v13 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[19:20], v14 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[17:18], v100 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[15:16], v10 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[13:14], v39 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[11:12], v9 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[9:10], v38 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[7:8], v6 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[3:4], v2 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[1:2], v37 -; GFX11-NEXT: scratch_store_b128 off, v[96:99], s1 -; GFX11-NEXT: scratch_store_b128 off, v[84:87], s2 -; GFX11-NEXT: scratch_store_b128 off, v[80:83], s3 -; GFX11-NEXT: scratch_store_b128 off, v[68:71], s4 -; GFX11-NEXT: scratch_store_b128 off, v[64:67], s5 -; GFX11-NEXT: scratch_store_b128 off, v[52:55], s6 -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s7 -; GFX11-NEXT: scratch_store_b128 off, v[33:36], s0 offset:128 -; GFX11-NEXT: scratch_store_b128 off, v[29:32], s8 -; GFX11-NEXT: scratch_store_b128 off, v[25:28], s9 -; GFX11-NEXT: scratch_store_b128 off, v[21:24], s10 -; GFX11-NEXT: scratch_store_b128 off, v[17:20], s0 offset:64 -; GFX11-NEXT: scratch_store_b128 off, v[13:16], s11 -; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_store_b128 off, v[9:12], s0 offset:32 -; GFX11-NEXT: scratch_store_b128 off, v[5:8], s0 offset:16 -; GFX11-NEXT: scratch_store_b128 off, v[1:4], s0 +; GFX11-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %load = load <32 x bfloat>, ptr addrspace(1) %ptr - %fpext = fpext <32 x bfloat> %load to <32 x double> - ret <32 x double> %fpext + %op = fsub <4 x bfloat> %a, %b + ret <4 x bfloat> %op } -define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { -; GCN-LABEL: v_fadd_bf16: +define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { +; GCN-LABEL: v_fmul_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fadd_bf16: +; GFX7-LABEL: v_fmul_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; 
GFX7-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fadd_bf16: +; GFX8-LABEL: v_fmul_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_bf16: +; GFX9-LABEL: v_fmul_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fadd_bf16: +; GFX10-LABEL: v_fmul_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fadd_bf16: +; GFX11-LABEL: v_fmul_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fadd bfloat %a, %b + %op = fmul bfloat %a, %b ret bfloat %op } -define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { -; GCN-LABEL: v_fadd_v2bf16: +define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { +; GCN-LABEL: v_fmul_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_add_f32_e32 v1, v1, v3 -; GCN-NEXT: v_add_f32_e32 v0, v0, v2 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fadd_v2bf16: +; GFX7-LABEL: v_fmul_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fadd_v2bf16: +; GFX8-LABEL: v_fmul_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 +; 
GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_mov_b32 s4, 0x3020706 ; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v2bf16: +; GFX9-LABEL: v_fmul_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fadd_v2bf16: +; GFX10-LABEL: v_fmul_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fadd_v2bf16: +; GFX11-LABEL: v_fmul_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 @@ -7002,17 +11190,17 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fadd <2 x bfloat> %a, %b + %op = fmul <2 x bfloat> %a, %b ret <2 x bfloat> %op } -define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { -; GCN-LABEL: v_fadd_v3bf16: +define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { +; GCN-LABEL: v_fmul_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 @@ -7021,15 +11209,15 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_add_f32_e32 v2, v2, v5 -; GCN-NEXT: v_add_f32_e32 v1, v1, v4 -; GCN-NEXT: v_add_f32_e32 v0, v0, v3 +; GCN-NEXT: v_mul_f32_e32 v2, v2, v5 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v4 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fadd_v3bf16: +; GFX7-LABEL: v_fmul_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 @@ -7038,32 +11226,32 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; 
GFX7-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fadd_v3bf16: +; GFX8-LABEL: v_fmul_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX8-NEXT: s_mov_b32 s4, 0x3020706 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v3bf16: +; GFX9-LABEL: v_fmul_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -7072,15 +11260,15 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fadd_v3bf16: +; GFX10-LABEL: v_fmul_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 @@ -7089,14 +11277,14 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fadd_v3bf16: +; GFX11-LABEL: v_fmul_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -7105,20 +11293,20 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_f32_e32 v4, v5, v4 +; 
GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fadd <3 x bfloat> %a, %b + %op = fmul <3 x bfloat> %a, %b ret <3 x bfloat> %op } -define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { -; GCN-LABEL: v_fadd_v4bf16: +define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; GCN-LABEL: v_fmul_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -7129,17 +11317,17 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_add_f32_e32 v3, v3, v7 -; GCN-NEXT: v_add_f32_e32 v2, v2, v6 -; GCN-NEXT: v_add_f32_e32 v1, v1, v5 -; GCN-NEXT: v_add_f32_e32 v0, v0, v4 +; GCN-NEXT: v_mul_f32_e32 v3, v3, v7 +; GCN-NEXT: v_mul_f32_e32 v2, v2, v6 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v5 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fadd_v4bf16: +; GFX7-LABEL: v_fmul_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -7150,58 +11338,58 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fadd_v4bf16: +; GFX8-LABEL: v_fmul_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX8-NEXT: s_mov_b32 s4, 0x3020706 ; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v4bf16: +; GFX9-LABEL: v_fmul_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 
-; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fadd_v4bf16: +; GFX10-LABEL: v_fmul_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -7212,16 +11400,16 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_add_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_add_f32_e32 v4, v7, v6 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_mul_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_mul_f32_e32 v4, v7, v6 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fadd_v4bf16: +; GFX11-LABEL: v_fmul_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 @@ -7231,11 +11419,11 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4 +; GFX11-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 @@ -7243,12 +11431,12 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fadd <4 x bfloat> %a, %b + %op = fmul <4 x bfloat> %a, %b ret <4 x bfloat> %op } -define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { -; GCN-LABEL: v_fadd_v8bf16: +define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { +; GCN-LABEL: v_fmul_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -7267,14 +11455,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x 
bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_add_f32_e32 v7, v7, v15 -; GCN-NEXT: v_add_f32_e32 v6, v6, v14 -; GCN-NEXT: v_add_f32_e32 v5, v5, v13 -; GCN-NEXT: v_add_f32_e32 v4, v4, v12 -; GCN-NEXT: v_add_f32_e32 v3, v3, v11 -; GCN-NEXT: v_add_f32_e32 v2, v2, v10 -; GCN-NEXT: v_add_f32_e32 v1, v1, v9 -; GCN-NEXT: v_add_f32_e32 v0, v0, v8 +; GCN-NEXT: v_mul_f32_e32 v7, v7, v15 +; GCN-NEXT: v_mul_f32_e32 v6, v6, v14 +; GCN-NEXT: v_mul_f32_e32 v5, v5, v13 +; GCN-NEXT: v_mul_f32_e32 v4, v4, v12 +; GCN-NEXT: v_mul_f32_e32 v3, v3, v11 +; GCN-NEXT: v_mul_f32_e32 v2, v2, v10 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v9 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -7285,7 +11473,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fadd_v8bf16: +; GFX7-LABEL: v_fmul_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -7304,14 +11492,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX7-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX7-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX7-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX7-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX7-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -7322,33 +11510,33 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fadd_v8bf16: +; GFX8-LABEL: v_fmul_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7 ; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6 ; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v6, v9, v6 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX8-NEXT: 
v_mul_f32_e32 v1, v1, v5 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v5, v9, v5 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX8-NEXT: s_mov_b32 s4, 0x3020706 ; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4 ; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4 @@ -7356,18 +11544,18 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v8bf16: +; GFX9-LABEL: v_fmul_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v9, v10, v9 +; GFX9-NEXT: v_mul_f32_e32 v9, v10, v9 ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v10, v11, v10 +; GFX9-NEXT: v_mul_f32_e32 v10, v11, v10 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -7376,14 +11564,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; GFX9-NEXT: v_add_f32_e32 v11, v12, v11 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_f32_e32 v11, v12, v11 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706 ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 @@ -7392,7 +11580,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fadd_v8bf16: +; GFX10-LABEL: v_fmul_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 @@ -7403,8 +11591,8 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 -; GFX10-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX10-NEXT: v_add_f32_e32 v9, v11, v10 +; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_mul_f32_e32 v9, v11, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 @@ -7413,13 +11601,13 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: 
v_add_f32_e32 v10, v13, v12 -; GFX10-NEXT: v_add_f32_e32 v11, v15, v14 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_mul_f32_e32 v10, v13, v12 +; GFX10-NEXT: v_mul_f32_e32 v11, v15, v14 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 ; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -7427,7 +11615,7 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fadd_v8bf16: +; GFX11-LABEL: v_fmul_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 @@ -7439,22 +11627,22 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 ; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v4 :: v_dual_lshlrev_b32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5 +; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v9, 0xffff0000, v5 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_add_f32_e32 v9, v10, v9 -; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2 +; GFX11-NEXT: v_mul_f32_e32 v9, v10, v9 +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_and_b32 v12, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v5 -; GFX11-NEXT: v_dual_add_f32 v10, v12, v11 :: v_dual_add_f32 v11, v14, v13 +; GFX11-NEXT: v_mul_f32_e32 v2, v2, v5 +; GFX11-NEXT: v_dual_mul_f32 v10, v12, v11 :: v_dual_mul_f32 v11, v14, v13 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 @@ -7463,47 +11651,47 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fadd <8 x bfloat> %a, %b + %op = fmul <8 x bfloat> %a, %b ret <8 x bfloat> %op } -define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { -; GCN-LABEL: v_fadd_v16bf16: +define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { +; GCN-LABEL: v_fmul_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v14, 
0xffff0000, v14 -; GCN-NEXT: v_add_f32_e32 v14, v14, v30 +; GCN-NEXT: v_mul_f32_e32 v14, v14, v30 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_add_f32_e32 v13, v13, v29 +; GCN-NEXT: v_mul_f32_e32 v13, v13, v29 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_add_f32_e32 v12, v12, v28 +; GCN-NEXT: v_mul_f32_e32 v12, v12, v28 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_add_f32_e32 v11, v11, v27 +; GCN-NEXT: v_mul_f32_e32 v11, v11, v27 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_add_f32_e32 v10, v10, v26 +; GCN-NEXT: v_mul_f32_e32 v10, v10, v26 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_add_f32_e32 v9, v9, v25 +; GCN-NEXT: v_mul_f32_e32 v9, v9, v25 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_add_f32_e32 v8, v8, v24 +; GCN-NEXT: v_mul_f32_e32 v8, v8, v24 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_add_f32_e32 v7, v7, v23 +; GCN-NEXT: v_mul_f32_e32 v7, v7, v23 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_add_f32_e32 v6, v6, v22 +; GCN-NEXT: v_mul_f32_e32 v6, v6, v22 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_add_f32_e32 v5, v5, v21 +; GCN-NEXT: v_mul_f32_e32 v5, v5, v21 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_add_f32_e32 v4, v4, v20 +; GCN-NEXT: v_mul_f32_e32 v4, v4, v20 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 @@ -7514,10 +11702,10 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_add_f32_e32 v3, v3, v19 -; GCN-NEXT: v_add_f32_e32 v2, v2, v18 -; GCN-NEXT: v_add_f32_e32 v1, v1, v17 -; GCN-NEXT: v_add_f32_e32 v0, v0, v16 +; GCN-NEXT: v_mul_f32_e32 v3, v3, v19 +; GCN-NEXT: v_mul_f32_e32 v2, v2, v18 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v17 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -7534,17 +11722,17 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 -; GCN-NEXT: v_add_f32_e32 v15, v15, v16 +; GCN-NEXT: v_mul_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fadd_v16bf16: +; GFX7-LABEL: v_fmul_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX7-NEXT: v_mul_f32_e32 v4, v4, v20 ; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -7575,20 
+11763,20 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX7-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX7-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX7-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX7-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX7-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX7-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX7-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX7-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX7-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX7-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v17 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -7606,61 +11794,61 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 -; GFX7-NEXT: v_add_f32_e32 v15, v15, v16 +; GFX7-NEXT: v_mul_f32_e32 v15, v15, v16 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fadd_v16bf16: +; GFX8-LABEL: v_fmul_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX8-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15 ; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v15, v17, v15 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14 ; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_add_f32_e32 v14, v17, v14 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13 ; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_add_f32_e32 v13, v17, v13 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12 ; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; 
GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v12, v17, v12 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11 ; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v11, v17, v11 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10 ; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v10, v17, v10 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX8-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v9, v17, v9 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX8-NEXT: v_mul_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX8-NEXT: s_mov_b32 s4, 0x3020706 ; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4 ; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4 @@ -7672,30 +11860,30 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v16bf16: +; GFX9-LABEL: v_fmul_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 ; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 -; GFX9-NEXT: v_add_f32_e32 v17, v18, v17 +; GFX9-NEXT: v_mul_f32_e32 v17, v18, v17 ; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5 -; GFX9-NEXT: v_add_f32_e32 v18, v19, v18 +; GFX9-NEXT: v_mul_f32_e32 v18, v19, v18 ; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v19, v20, v19 +; GFX9-NEXT: v_mul_f32_e32 v19, v20, v19 ; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 -; GFX9-NEXT: v_add_f32_e32 v20, v21, v20 +; GFX9-NEXT: v_mul_f32_e32 v20, v21, v20 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v21, v22, v21 +; GFX9-NEXT: v_mul_f32_e32 v21, v22, v21 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 -; GFX9-NEXT: v_add_f32_e32 v22, v23, v22 +; GFX9-NEXT: v_mul_f32_e32 v22, v23, v22 ; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 ; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 @@ -7713,16 +11901,16 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v23, v24, v23 -; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 -; 
GFX9-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v23, v24, v23 +; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706 ; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4 ; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4 @@ -7734,7 +11922,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fadd_v16bf16: +; GFX10-LABEL: v_fmul_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 @@ -7743,18 +11931,18 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 ; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 ; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; GFX10-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16 ; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 ; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v2 -; GFX10-NEXT: v_add_f32_e32 v17, v18, v17 -; GFX10-NEXT: v_add_f32_e32 v18, v20, v19 -; GFX10-NEXT: v_add_f32_e32 v19, v22, v21 -; GFX10-NEXT: v_add_f32_e32 v20, v24, v23 -; GFX10-NEXT: v_add_f32_e32 v21, v26, v25 +; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_mul_f32_e32 v18, v20, v19 +; GFX10-NEXT: v_mul_f32_e32 v19, v22, v21 +; GFX10-NEXT: v_mul_f32_e32 v20, v24, v23 +; GFX10-NEXT: v_mul_f32_e32 v21, v26, v25 ; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 ; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 @@ -7775,16 +11963,16 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_add_f32_e32 v22, v23, v22 -; GFX10-NEXT: v_add_f32_e32 v23, v25, v24 -; GFX10-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX10-NEXT: v_mul_f32_e32 v22, v23, v22 +; GFX10-NEXT: v_mul_f32_e32 v23, v25, v24 +; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12 ; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 ; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 ; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 @@ -7795,7 +11983,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 ; GFX10-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fadd_v16bf16: +; GFX11-LABEL: v_fmul_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 @@ -7815,17 +12003,17 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14 +; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v17, 0xffff0000, v14 ; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v4, v4, v12 :: v_dual_add_f32 v5, v5, v13 -; GFX11-NEXT: v_dual_add_f32 v17, v18, v17 :: v_dual_add_f32 v18, v20, v19 -; GFX11-NEXT: v_add_f32_e32 v19, v22, v21 -; GFX11-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX11-NEXT: v_add_f32_e32 v21, v26, v25 -; GFX11-NEXT: v_dual_add_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0 -; GFX11-NEXT: v_add_f32_e32 v20, v24, v23 +; GFX11-NEXT: v_dual_mul_f32 v4, v4, v12 :: v_dual_mul_f32 v5, v5, v13 +; GFX11-NEXT: v_dual_mul_f32 v17, v18, v17 :: v_dual_mul_f32 v18, v20, v19 +; GFX11-NEXT: v_mul_f32_e32 v19, v22, v21 +; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX11-NEXT: v_mul_f32_e32 v21, v26, v25 +; GFX11-NEXT: v_dual_mul_f32 v6, v6, v14 :: v_dual_and_b32 v25, 0xffff0000, v0 +; GFX11-NEXT: v_mul_f32_e32 v20, v24, v23 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -7837,12 +12025,12 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v8 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_add_f32 v3, v3, v11 +; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_mul_f32 v3, v3, v11 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v1, v1, v9 :: v_dual_add_f32 v22, v23, v22 -; GFX11-NEXT: v_add_f32_e32 v23, v25, v24 +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v9 :: v_dual_mul_f32 v22, v23, v22 +; GFX11-NEXT: v_mul_f32_e32 v23, v25, v24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v8 ; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 ; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 ; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 @@ -7852,12 +12040,12 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 ; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fadd <16 x bfloat> %a, %b + %op = fmul <16 x bfloat> %a, %b ret <16 x bfloat> %op } -define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { -; GCN-LABEL: v_fadd_v32bf16: +define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { +; GCN-LABEL: v_fmul_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 @@ -7867,161 +12055,161 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; 
GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_add_f32_e32 v31, v32, v31 +; GCN-NEXT: v_mul_f32_e32 v31, v32, v31 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 -; GCN-NEXT: v_add_f32_e32 v30, v30, v32 +; GCN-NEXT: v_mul_f32_e32 v30, v30, v32 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v29, v29, v33 +; GCN-NEXT: v_mul_f32_e32 v29, v29, v33 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 -; GCN-NEXT: v_add_f32_e32 v28, v28, v32 +; GCN-NEXT: v_mul_f32_e32 v28, v28, v32 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v27, v27, v33 +; GCN-NEXT: v_mul_f32_e32 v27, v27, v33 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 -; GCN-NEXT: v_add_f32_e32 v26, v26, v32 +; GCN-NEXT: v_mul_f32_e32 v26, v26, v32 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v25, v25, v33 +; GCN-NEXT: v_mul_f32_e32 v25, v25, v33 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 -; GCN-NEXT: v_add_f32_e32 v24, v24, v32 +; GCN-NEXT: v_mul_f32_e32 v24, v24, v32 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v23, v23, v33 +; GCN-NEXT: v_mul_f32_e32 v23, v23, v33 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 -; GCN-NEXT: v_add_f32_e32 v22, v22, v32 +; GCN-NEXT: v_mul_f32_e32 v22, v22, v32 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v21, v21, v33 +; GCN-NEXT: v_mul_f32_e32 v21, v21, v33 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 -; GCN-NEXT: v_add_f32_e32 v20, v20, v32 +; GCN-NEXT: v_mul_f32_e32 v20, v20, v32 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v19, v19, v33 +; GCN-NEXT: v_mul_f32_e32 v19, v19, v33 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, 
off, s[0:3], s32 offset:72 -; GCN-NEXT: v_add_f32_e32 v18, v18, v32 +; GCN-NEXT: v_mul_f32_e32 v18, v18, v32 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v17, v17, v33 +; GCN-NEXT: v_mul_f32_e32 v17, v17, v33 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; GCN-NEXT: v_add_f32_e32 v16, v16, v32 +; GCN-NEXT: v_mul_f32_e32 v16, v16, v32 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v15, v15, v33 +; GCN-NEXT: v_mul_f32_e32 v15, v15, v33 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 -; GCN-NEXT: v_add_f32_e32 v14, v14, v32 +; GCN-NEXT: v_mul_f32_e32 v14, v14, v32 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v13, v13, v33 +; GCN-NEXT: v_mul_f32_e32 v13, v13, v33 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; GCN-NEXT: v_add_f32_e32 v12, v12, v32 +; GCN-NEXT: v_mul_f32_e32 v12, v12, v32 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v11, v11, v33 +; GCN-NEXT: v_mul_f32_e32 v11, v11, v33 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 -; GCN-NEXT: v_add_f32_e32 v10, v10, v32 +; GCN-NEXT: v_mul_f32_e32 v10, v10, v32 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v9, v9, v33 +; GCN-NEXT: v_mul_f32_e32 v9, v9, v33 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; GCN-NEXT: v_add_f32_e32 v8, v8, v32 +; GCN-NEXT: v_mul_f32_e32 v8, v8, v32 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v7, v7, v33 +; GCN-NEXT: v_mul_f32_e32 v7, v7, v33 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 -; GCN-NEXT: v_add_f32_e32 v6, v6, v32 +; GCN-NEXT: v_mul_f32_e32 v6, v6, v32 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v5, v5, v33 +; GCN-NEXT: 
v_mul_f32_e32 v5, v5, v33 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; GCN-NEXT: v_add_f32_e32 v4, v4, v32 +; GCN-NEXT: v_mul_f32_e32 v4, v4, v32 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v3, v3, v33 +; GCN-NEXT: v_mul_f32_e32 v3, v3, v33 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GCN-NEXT: v_add_f32_e32 v2, v2, v32 +; GCN-NEXT: v_mul_f32_e32 v2, v2, v32 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; GCN-NEXT: v_add_f32_e32 v1, v1, v33 +; GCN-NEXT: v_mul_f32_e32 v1, v1, v33 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_add_f32_e32 v0, v0, v32 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -8056,7 +12244,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fadd_v32bf16: +; GFX7-LABEL: v_fmul_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 @@ -8096,211 +12284,211 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v31, v32, v31 +; GFX7-NEXT: v_mul_f32_e32 v31, v32, v31 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v30, v30, v32 +; GFX7-NEXT: v_mul_f32_e32 v30, v30, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v29, v29, v32 +; GFX7-NEXT: v_mul_f32_e32 v29, v29, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v28, v28, v32 +; GFX7-NEXT: v_mul_f32_e32 v28, v28, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v27, v27, v32 +; GFX7-NEXT: v_mul_f32_e32 v27, v27, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v26, v26, v32 +; GFX7-NEXT: v_mul_f32_e32 v26, v26, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; 
GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v25, v25, v32 +; GFX7-NEXT: v_mul_f32_e32 v25, v25, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v24, v24, v32 +; GFX7-NEXT: v_mul_f32_e32 v24, v24, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v23, v23, v32 +; GFX7-NEXT: v_mul_f32_e32 v23, v23, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v22, v22, v32 +; GFX7-NEXT: v_mul_f32_e32 v22, v22, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v21, v21, v32 +; GFX7-NEXT: v_mul_f32_e32 v21, v21, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v20, v20, v32 +; GFX7-NEXT: v_mul_f32_e32 v20, v20, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v19, v19, v32 +; GFX7-NEXT: v_mul_f32_e32 v19, v19, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v18, v18, v32 +; GFX7-NEXT: v_mul_f32_e32 v18, v18, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v17, v17, v32 +; GFX7-NEXT: v_mul_f32_e32 v17, v17, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v16, v16, v32 +; GFX7-NEXT: v_mul_f32_e32 v16, v16, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v15, v15, v32 +; GFX7-NEXT: v_mul_f32_e32 v15, v15, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v14, v14, v32 +; GFX7-NEXT: v_mul_f32_e32 v14, v14, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v13, v13, v32 +; GFX7-NEXT: v_mul_f32_e32 v13, v13, v32 ; GFX7-NEXT: buffer_load_dword v32, off, 
s[0:3], s32 offset:52 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v12, v12, v32 +; GFX7-NEXT: v_mul_f32_e32 v12, v12, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v11, v11, v32 +; GFX7-NEXT: v_mul_f32_e32 v11, v11, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v10, v10, v32 +; GFX7-NEXT: v_mul_f32_e32 v10, v10, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v9, v9, v32 +; GFX7-NEXT: v_mul_f32_e32 v9, v9, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v8, v8, v32 +; GFX7-NEXT: v_mul_f32_e32 v8, v8, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v7, v7, v32 +; GFX7-NEXT: v_mul_f32_e32 v7, v7, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v6, v6, v32 +; GFX7-NEXT: v_mul_f32_e32 v6, v6, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v5, v5, v32 +; GFX7-NEXT: v_mul_f32_e32 v5, v5, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v4, v4, v32 +; GFX7-NEXT: v_mul_f32_e32 v4, v4, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v3, v3, v32 +; GFX7-NEXT: v_mul_f32_e32 v3, v3, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v2, v2, v32 +; GFX7-NEXT: v_mul_f32_e32 v2, v2, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v1, v1, v32 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v32 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v32 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] 
; -; GFX8-LABEL: v_fadd_v32bf16: +; GFX8-LABEL: v_fmul_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 ; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 ; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX8-NEXT: v_add_f32_e32 v31, v32, v31 -; GFX8-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX8-NEXT: v_mul_f32_e32 v31, v32, v31 +; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30 ; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 ; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 ; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX8-NEXT: v_add_f32_e32 v30, v32, v30 -; GFX8-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_mul_f32_e32 v30, v32, v30 +; GFX8-NEXT: v_mul_f32_e32 v13, v13, v29 ; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 ; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 ; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX8-NEXT: v_add_f32_e32 v29, v32, v29 -; GFX8-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_mul_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28 ; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 ; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 ; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX8-NEXT: v_add_f32_e32 v28, v32, v28 -; GFX8-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_mul_f32_e32 v28, v32, v28 +; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27 ; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 ; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v10 ; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX8-NEXT: v_add_f32_e32 v27, v32, v27 -; GFX8-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_mul_f32_e32 v27, v32, v27 +; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26 ; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 ; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v9 ; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX8-NEXT: v_add_f32_e32 v26, v32, v26 -; GFX8-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_mul_f32_e32 v26, v32, v26 +; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25 ; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 ; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX8-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24 ; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 -; GFX8-NEXT: v_add_f32_e32 v25, v32, v25 +; GFX8-NEXT: v_mul_f32_e32 v25, v32, v25 ; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX8-NEXT: s_mov_b32 s4, 0x3020706 @@ -8314,56 +12502,56 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v24 ; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX8-NEXT: v_add_f32_e32 v32, v32, v33 -; GFX8-NEXT: v_add_f32_e32 v15, v15, v24 +; GFX8-NEXT: v_mul_f32_e32 v32, v32, v33 +; GFX8-NEXT: v_mul_f32_e32 v15, v15, v24 ; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 ; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX8-NEXT: v_add_f32_e32 v24, v33, v24 -; GFX8-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_mul_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23 ; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 ; 
GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX8-NEXT: v_add_f32_e32 v23, v33, v23 -; GFX8-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22 ; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 ; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX8-NEXT: v_add_f32_e32 v22, v33, v22 -; GFX8-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21 ; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 ; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_add_f32_e32 v21, v33, v21 -; GFX8-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_mul_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20 ; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 ; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_add_f32_e32 v20, v33, v20 -; GFX8-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_mul_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19 ; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 ; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_add_f32_e32 v19, v33, v19 -; GFX8-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_mul_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18 ; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 ; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_add_f32_e32 v18, v33, v18 -; GFX8-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX8-NEXT: v_mul_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17 ; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 ; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_add_f32_e32 v17, v33, v17 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_mul_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16 ; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4 ; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4 ; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4 @@ -8375,7 +12563,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_perm_b32 v15, v15, v32, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v32bf16: +; GFX9-LABEL: v_fmul_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill @@ -8414,19 +12602,19 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_and_b32_e32 v47, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v56, 0xffff0000, v18 ; GFX9-NEXT: v_and_b32_e32 v57, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v38, v39, v38 -; GFX9-NEXT: v_add_f32_e32 v39, v49, v48 -; GFX9-NEXT: v_add_f32_e32 v48, v51, v50 -; GFX9-NEXT: v_add_f32_e32 v51, v41, v40 -; GFX9-NEXT: v_add_f32_e32 v40, v59, v58 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_mul_f32_e32 v38, v39, v38 +; GFX9-NEXT: v_mul_f32_e32 v39, v49, v48 +; GFX9-NEXT: v_mul_f32_e32 v48, v51, v50 +; GFX9-NEXT: v_mul_f32_e32 v51, v41, v40 +; GFX9-NEXT: 
v_mul_f32_e32 v40, v59, v58 +; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_add_f32_e32 v49, v53, v52 -; GFX9-NEXT: v_add_f32_e32 v50, v55, v54 -; GFX9-NEXT: v_add_f32_e32 v52, v43, v42 -; GFX9-NEXT: v_add_f32_e32 v53, v45, v44 -; GFX9-NEXT: v_add_f32_e32 v54, v47, v46 -; GFX9-NEXT: v_add_f32_e32 v55, v57, v56 +; GFX9-NEXT: v_mul_f32_e32 v49, v53, v52 +; GFX9-NEXT: v_mul_f32_e32 v50, v55, v54 +; GFX9-NEXT: v_mul_f32_e32 v52, v43, v42 +; GFX9-NEXT: v_mul_f32_e32 v53, v45, v44 +; GFX9-NEXT: v_mul_f32_e32 v54, v47, v46 +; GFX9-NEXT: v_mul_f32_e32 v55, v57, v56 ; GFX9-NEXT: v_perm_b32 v1, v1, v40, s4 ; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -8447,14 +12635,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 ; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 ; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 -; GFX9-NEXT: v_add_f32_e32 v32, v33, v32 +; GFX9-NEXT: v_mul_f32_e32 v32, v33, v32 ; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v16 -; GFX9-NEXT: v_add_f32_e32 v34, v35, v34 +; GFX9-NEXT: v_mul_f32_e32 v34, v35, v34 ; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v0 -; GFX9-NEXT: v_add_f32_e32 v36, v37, v36 +; GFX9-NEXT: v_mul_f32_e32 v36, v37, v36 ; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_add_f32_e32 v33, v35, v33 +; GFX9-NEXT: v_mul_f32_e32 v33, v35, v33 ; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 @@ -8483,20 +12671,20 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 ; GFX9-NEXT: v_perm_b32 v0, v0, v33, s4 ; GFX9-NEXT: v_perm_b32 v2, v2, v55, s4 ; GFX9-NEXT: v_perm_b32 v3, v3, v54, s4 @@ -8514,12 +12702,12 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v35, 0xffff0000, v31 ; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v35, v37, v35 -; GFX9-NEXT: v_add_f32_e32 v15, v15, v31 +; GFX9-NEXT: v_mul_f32_e32 v35, v37, 
v35 +; GFX9-NEXT: v_mul_f32_e32 v15, v15, v31 ; GFX9-NEXT: v_perm_b32 v15, v15, v35, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fadd_v32bf16: +; GFX10-LABEL: v_fmul_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -8543,30 +12731,30 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 ; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 ; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 -; GFX10-NEXT: v_add_f32_e32 v53, v54, v53 +; GFX10-NEXT: v_mul_f32_e32 v53, v54, v53 ; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 -; GFX10-NEXT: v_add_f32_e32 v55, v64, v55 +; GFX10-NEXT: v_mul_f32_e32 v55, v64, v55 ; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1 -; GFX10-NEXT: v_add_f32_e32 v65, v66, v65 +; GFX10-NEXT: v_mul_f32_e32 v65, v66, v65 ; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16 -; GFX10-NEXT: v_add_f32_e32 v67, v68, v67 +; GFX10-NEXT: v_mul_f32_e32 v67, v68, v67 ; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX10-NEXT: v_add_f32_e32 v33, v34, v33 +; GFX10-NEXT: v_mul_f32_e32 v33, v34, v33 ; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20 -; GFX10-NEXT: v_add_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_mul_f32_e32 v35, v36, v35 ; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX10-NEXT: v_add_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_mul_f32_e32 v37, v38, v37 ; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19 -; GFX10-NEXT: v_add_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_mul_f32_e32 v39, v48, v39 ; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 -; GFX10-NEXT: v_add_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_mul_f32_e32 v49, v50, v49 ; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18 -; GFX10-NEXT: v_add_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_mul_f32_e32 v51, v52, v51 ; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30 @@ -8595,26 +12783,26 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX10-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX10-NEXT: v_add_f32_e32 v34, v36, v34 -; GFX10-NEXT: v_add_f32_e32 v36, v48, v38 -; GFX10-NEXT: v_add_f32_e32 v38, v52, v50 -; GFX10-NEXT: v_add_f32_e32 v48, v64, v54 -; GFX10-NEXT: v_add_f32_e32 v50, v68, v66 -; GFX10-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX10-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX10-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX10-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX10-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX10-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX10-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX10-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX10-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX10-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX10-NEXT: v_add_f32_e32 v2, v2, v18 -; GFX10-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX10-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX10-NEXT: v_mul_f32_e32 v34, v36, v34 +; GFX10-NEXT: v_mul_f32_e32 v36, v48, v38 +; GFX10-NEXT: v_mul_f32_e32 v38, v52, v50 +; GFX10-NEXT: v_mul_f32_e32 v48, v64, v54 +; 
GFX10-NEXT: v_mul_f32_e32 v50, v68, v66 +; GFX10-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_mul_f32_e32 v4, v4, v20 ; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706 ; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706 ; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706 @@ -8633,12 +12821,12 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 ; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 -; GFX10-NEXT: v_add_f32_e32 v16, v32, v16 -; GFX10-NEXT: v_add_f32_e32 v15, v15, v17 +; GFX10-NEXT: v_mul_f32_e32 v16, v32, v16 +; GFX10-NEXT: v_mul_f32_e32 v15, v15, v17 ; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fadd_v32bf16: +; GFX11-LABEL: v_fmul_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 @@ -8656,20 +12844,20 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 ; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18 ; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 -; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22 +; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v22, 16, v22 ; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 ; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_dual_add_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23 +; GFX11-NEXT: v_dual_mul_f32 v6, v6, v22 :: v_dual_lshlrev_b32 v23, 16, v23 ; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 -; GFX11-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19 +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v19, 16, v19 ; GFX11-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 ; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 ; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 @@ -8678,19 +12866,19 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GFX11-NEXT: v_and_b32_e32 v80, 0xffff0000, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21 +; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v67, 0xffff0000, v21 ; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12 +; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v38, 0xffff0000, v12 ; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 ; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; 
GFX11-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23 ; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 ; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 ; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX11-NEXT: v_mul_f32_e32 v9, v9, v25 ; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 ; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 ; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 @@ -8698,20 +12886,20 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 ; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4 +; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v4, 16, v4 ; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 ; GFX11-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX11-NEXT: v_dual_add_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15 +; GFX11-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_lshlrev_b32 v15, 16, v15 ; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 ; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_dual_add_f32 v33, v34, v33 :: v_dual_add_f32 v34, v36, v35 -; GFX11-NEXT: v_dual_add_f32 v35, v38, v37 :: v_dual_add_f32 v12, v12, v28 +; GFX11-NEXT: v_dual_mul_f32 v33, v34, v33 :: v_dual_mul_f32 v34, v36, v35 +; GFX11-NEXT: v_dual_mul_f32 v35, v38, v37 :: v_dual_mul_f32 v12, v12, v28 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_add_f32 v5, v5, v21 +; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_mul_f32 v5, v5, v21 ; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 @@ -8719,16 +12907,16 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31 ; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_add_f32 v16, v32, v16 :: v_dual_add_f32 v13, v13, v29 -; GFX11-NEXT: v_dual_add_f32 v15, v15, v17 :: v_dual_add_f32 v14, v14, v30 -; GFX11-NEXT: v_add_f32_e32 v36, v48, v39 -; GFX11-NEXT: v_dual_add_f32 v48, v64, v55 :: v_dual_add_f32 v37, v50, v49 -; GFX11-NEXT: v_add_f32_e32 v50, v68, v67 -; GFX11-NEXT: v_dual_add_f32 v38, v52, v51 :: v_dual_add_f32 v51, v70, v69 -; GFX11-NEXT: v_dual_add_f32 v52, v80, v71 :: v_dual_add_f32 v39, v54, v53 -; GFX11-NEXT: v_dual_add_f32 v53, v82, v81 :: v_dual_add_f32 v54, v84, v83 -; GFX11-NEXT: v_add_f32_e32 v55, v86, v85 -; GFX11-NEXT: v_add_f32_e32 v49, v66, v65 +; GFX11-NEXT: v_dual_mul_f32 v16, v32, v16 :: v_dual_mul_f32 v13, v13, v29 +; GFX11-NEXT: v_dual_mul_f32 v15, v15, v17 :: v_dual_mul_f32 v14, v14, v30 +; GFX11-NEXT: v_mul_f32_e32 v36, v48, v39 +; GFX11-NEXT: v_dual_mul_f32 v48, v64, v55 :: v_dual_mul_f32 v37, v50, v49 +; GFX11-NEXT: v_mul_f32_e32 v50, v68, v67 +; GFX11-NEXT: v_dual_mul_f32 v38, v52, v51 :: v_dual_mul_f32 v51, v70, v69 +; GFX11-NEXT: v_dual_mul_f32 v52, v80, v71 :: v_dual_mul_f32 v39, v54, v53 +; GFX11-NEXT: 
v_dual_mul_f32 v53, v82, v81 :: v_dual_mul_f32 v54, v84, v83 +; GFX11-NEXT: v_mul_f32_e32 v55, v86, v85 +; GFX11-NEXT: v_mul_f32_e32 v49, v66, v65 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706 ; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706 @@ -8746,530 +12934,634 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706 ; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fadd <32 x bfloat> %a, %b - ret <32 x bfloat> %op + %op = fmul <32 x bfloat> %a, %b + ret <32 x bfloat> %op +} + +define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { +; GCN-LABEL: v_fdiv_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v3, v2 +; GCN-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GCN-NEXT: v_fma_f32 v3, v4, v3, v3 +; GCN-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GCN-NEXT: v_mul_f32_e32 v5, v4, v3 +; GCN-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GCN-NEXT: v_fma_f32 v5, v6, v3, v5 +; GCN-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GCN-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GCN-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fdiv_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX7-NEXT: v_rcp_f32_e32 v3, v2 +; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 +; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fdiv_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX8-NEXT: v_rcp_f32_e32 v4, v2 +; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fdiv_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v2 +; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX9-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX9-NEXT: v_div_fmas_f32 v2, 
v2, v4, v5 +; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fdiv_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 +; GFX10-NEXT: v_rcp_f32_e32 v3, v2 +; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fdiv_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX11-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fdiv bfloat %a, %b + ret bfloat %op } -define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { -; GCN-LABEL: v_fsub_bf16: +declare bfloat @llvm.fabs.bf16(bfloat) + +define bfloat @v_fabs_bf16(bfloat %a) { +; GCN-LABEL: v_fabs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_sub_f32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fsub_bf16: +; GFX7-LABEL: v_fabs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fsub_bf16: +; GFX8-LABEL: v_fabs_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_bf16: +; GFX9-LABEL: v_fabs_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fsub_bf16: +; GFX10-LABEL: v_fabs_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fsub_bf16: +; GFX11-LABEL: v_fabs_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fsub bfloat %a, %b + %op = call bfloat @llvm.fabs.bf16(bfloat %a) ret bfloat %op } -define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { -; GCN-LABEL: v_fsub_v2bf16: +define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) { +; GCN-LABEL: s_fabs_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GCN-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: s_fabs_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fabs_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fabs_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fabs_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_fabs_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_bfe_u32 s0, s0, 0xf0010 +; GFX11-NEXT: ; return to shader part epilog + %op = call bfloat @llvm.fabs.bf16(bfloat %a) + %cast = bitcast bfloat %op to i16 + %zext = zext i16 %cast to i32 + %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) + ret i32 %readlane +} + +define bfloat @v_fneg_bf16(bfloat %a) { +; GCN-LABEL: v_fneg_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_sub_f32_e32 v1, v1, v3 -; GCN-NEXT: v_sub_f32_e32 v0, v0, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fsub_v2bf16: +; GFX7-LABEL: v_fneg_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fsub_v2bf16: +; GFX8-LABEL: v_fneg_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_sub_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_v2bf16: +; GFX9-LABEL: v_fneg_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fsub_v2bf16: +; GFX10-LABEL: v_fneg_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_fneg_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = fneg bfloat %a + ret bfloat %op +} + +declare i32 @llvm.amdgcn.readfirstlane(i32) + +; FIXME: readfirstlane hack for other bugs +define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { +; GCN-LABEL: s_fneg_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: s_xor_b32 s0, s0, 0x8000 +; GCN-NEXT: ; return to shader part epilog +; +; GFX7-LABEL: s_fneg_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: s_xor_b32 s0, s0, 0x8000 +; GFX7-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fneg_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fneg_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fneg_bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_sub_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: v_xor_b32_e64 v0, 0xffff8000, s0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fsub_v2bf16: +; GFX11-LABEL: s_fneg_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_xor_b32_e64 v0, 0xffff8000, s0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fsub <2 x bfloat> %a, %b - ret <2 x bfloat> %op +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog + %op = fneg bfloat %a + %cast = bitcast bfloat %op to i16 + %zext = zext i16 %cast to i32 + %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) + ret i32 %readlane } -define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { -; GCN-LABEL: v_fsub_v3bf16: +define bfloat @v_fneg_fabs_bf16(bfloat %a) { +; GCN-LABEL: v_fneg_fabs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_sub_f32_e32 v2, v2, v5 -; GCN-NEXT: v_sub_f32_e32 v1, v1, v4 -; GCN-NEXT: v_sub_f32_e32 v0, v0, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fsub_v3bf16: +; GFX7-LABEL: v_fneg_fabs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_sub_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fsub_v3bf16: +; GFX8-LABEL: v_fneg_fabs_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 
v0, 0xffff0000, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_v3bf16: +; GFX9-LABEL: v_fneg_fabs_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fsub_v3bf16: +; GFX10-LABEL: v_fneg_fabs_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fsub_v3bf16: +; GFX11-LABEL: v_fneg_fabs_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fsub <3 x bfloat> %a, %b - ret <3 x bfloat> %op + %fabs = call bfloat @llvm.fabs.bf16(bfloat %a) + %op = fneg bfloat %fabs + ret bfloat %op } -define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { -; GCN-LABEL: v_fsub_v4bf16: +; FIXME: readfirstlane hack for other bugs +define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { +; GCN-LABEL: s_fneg_fabs_bf16: ; GCN: ; %bb.0: -; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_sub_f32_e32 v3, v3, v7 -; GCN-NEXT: v_sub_f32_e32 v2, v2, v6 -; GCN-NEXT: v_sub_f32_e32 v1, v1, v5 -; GCN-NEXT: v_sub_f32_e32 v0, v0, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NEXT: s_bitset1_b32 s0, 15 +; GCN-NEXT: ; return to shader part epilog ; -; GFX7-LABEL: v_fsub_v4bf16: +; GFX7-LABEL: s_fneg_fabs_bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_sub_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_sub_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_sub_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: s_setpc_b64 s[30:31] +; GFX7-NEXT: s_lshr_b32 s0, s0, 16 +; GFX7-NEXT: s_bitset1_b32 s0, 15 +; GFX7-NEXT: ; return to shader part epilog ; -; GFX8-LABEL: v_fsub_v4bf16: +; GFX8-LABEL: s_fneg_fabs_bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-NEXT: s_lshr_b32 s0, s0, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: v_fsub_v4bf16: +; GFX9-LABEL: s_fneg_fabs_bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v3, 
v5, v3 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: ; return to shader part epilog ; -; GFX10-LABEL: v_fsub_v4bf16: +; GFX10-LABEL: s_fneg_fabs_bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_sub_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_sub_f32_e32 v4, v7, v6 -; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 -; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-NEXT: v_or_b32_e64 v0, 0xffff8000, s0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: ; return to shader part epilog ; -; GFX11-LABEL: v_fsub_v4bf16: +; GFX11-LABEL: s_fneg_fabs_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e64 v0, 0xffff8000, s0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fsub <4 x bfloat> %a, %b - ret <4 x bfloat> %op +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: ; return to shader part epilog + %fabs = call bfloat @llvm.fabs.bf16(bfloat %a) + %op = fneg bfloat %fabs + %cast = bitcast bfloat %op to i16 + %zext = zext i16 %cast to i32 + %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) + ret i32 %readlane } -define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { -; GCN-LABEL: v_fmul_bf16: +declare bfloat 
@llvm.minnum.bf16(bfloat, bfloat) +declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>) +declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>) +declare <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>) +declare <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat>, <8 x bfloat>) +declare <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat>, <16 x bfloat>) +declare <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat>, <32 x bfloat>) + +define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { +; GCN-LABEL: v_minnum_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fmul_bf16: +; GFX7-LABEL: v_minnum_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fmul_bf16: +; GFX8-LABEL: v_minnum_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_bf16: +; GFX9-LABEL: v_minnum_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fmul_bf16: +; GFX10-LABEL: v_minnum_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fmul_bf16: +; GFX11-LABEL: v_minnum_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fmul bfloat %a, %b + %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) 
ret bfloat %op } -define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { -; GCN-LABEL: v_fmul_v2bf16: +define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { +; GCN-LABEL: v_minnum_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v3 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_min_f32_e32 v1, v1, v3 +; GCN-NEXT: v_min_f32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fmul_v2bf16: +; GFX7-LABEL: v_minnum_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fmul_v2bf16: +; GFX8-LABEL: v_minnum_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX8-NEXT: s_mov_b32 s4, 0x3020706 ; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v2bf16: +; GFX9-LABEL: v_minnum_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706 ; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fmul_v2bf16: +; GFX10-LABEL: v_minnum_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: 
v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fmul_v2bf16: +; GFX11-LABEL: v_minnum_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fmul <2 x bfloat> %a, %b + %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %op } -define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { -; GCN-LABEL: v_fmul_v3bf16: +define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { +; GCN-LABEL: v_minnum_v3bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 @@ -9278,15 +13570,21 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v2, v2, v5 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v4 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v3 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_min_f32_e32 v2, v2, v5 +; GCN-NEXT: v_min_f32_e32 v1, v1, v4 +; GCN-NEXT: v_min_f32_e32 v0, v0, v3 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fmul_v3bf16: +; GFX7-LABEL: v_minnum_v3bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 @@ -9295,87 +13593,115 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, v2, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: 
v_min_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fmul_v3bf16: +; GFX8-LABEL: v_minnum_v3bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: s_mov_b32 s4, 0x3020706 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v3bf16: +; GFX9-LABEL: v_minnum_v3bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_perm_b32 v0, v0, v4, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fmul_v3bf16: +; GFX10-LABEL: v_minnum_v3bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, 
v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fmul_v3bf16: +; GFX11-LABEL: v_minnum_v3bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_min_f32 v1, v1, v3 :: v_dual_min_f32 v0, v0, v2 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fmul <3 x bfloat> %a, %b + %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) ret <3 x bfloat> %op } -define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { -; GCN-LABEL: v_fmul_v4bf16: +define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; GCN-LABEL: v_minnum_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -9386,17 +13712,25 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v3, v3, v7 -; GCN-NEXT: v_mul_f32_e32 v2, v2, v6 -; GCN-NEXT: v_mul_f32_e32 v1, v1, v5 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v4 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_min_f32_e32 v3, v3, v7 +; GCN-NEXT: v_min_f32_e32 v2, v2, v6 +; GCN-NEXT: v_min_f32_e32 v1, v1, v5 +; GCN-NEXT: v_min_f32_e32 v0, v0, v4 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fmul_v4bf16: +; GFX7-LABEL: v_minnum_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 @@ -9407,58 +13741,82 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX7-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fmul_v4bf16: +; GFX8-LABEL: v_minnum_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX8-NEXT: s_mov_b32 s4, 0x3020706 ; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v4bf16: +; GFX9-LABEL: v_minnum_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fmul_v4bf16: +; GFX10-LABEL: v_minnum_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 @@ -9468,836 +13826,4775 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_f32_e32 v3, v5, v4 -; GFX10-NEXT: v_mul_f32_e32 v4, v7, v6 -; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_min_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_min_f32_e32 v4, v7, v6 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fmul_v4bf16: +; GFX11-LABEL: v_minnum_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_min_f32 v1, v1, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_min_f32_e32 v4, v6, v5 +; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) + ret <4 x bfloat> %op +} + +define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { +; GCN-LABEL: v_minnum_v8bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 
+; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_min_f32_e32 v7, v7, v15 +; GCN-NEXT: v_min_f32_e32 v6, v6, v14 +; GCN-NEXT: v_min_f32_e32 v5, v5, v13 +; GCN-NEXT: v_min_f32_e32 v4, v4, v12 +; GCN-NEXT: v_min_f32_e32 v3, v3, v11 +; GCN-NEXT: v_min_f32_e32 v2, v2, v10 +; GCN-NEXT: v_min_f32_e32 v1, v1, v9 +; GCN-NEXT: v_min_f32_e32 v0, v0, v8 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_minnum_v8bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v13 +; 
GFX7-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_minnum_v8bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX8-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minnum_v8bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX9-NEXT: v_min_f32_e32 v9, v10, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_min_f32_e32 v10, v11, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, 
v0 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; GFX9-NEXT: v_min_f32_e32 v11, v12, v11 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 +; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minnum_v8bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v2 +; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 +; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_max_f32_e32 v9, v11, v11 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_min_f32_e32 v9, v9, v10 +; GFX10-NEXT: v_max_f32_e32 v10, v11, v11 +; GFX10-NEXT: v_max_f32_e32 v11, v12, v12 +; GFX10-NEXT: v_max_f32_e32 v12, v13, v13 +; GFX10-NEXT: v_max_f32_e32 v13, v14, v14 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f32_e32 v10, v11, v10 +; GFX10-NEXT: v_min_f32_e32 v11, v13, v12 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 +; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minnum_v8bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8 +; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_min_f32 v8, v9, v8 +; GFX11-NEXT: v_min_f32_e32 v9, v11, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX11-NEXT: v_max_f32_e32 v10, v12, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_f32_e32 v10, v11, v10 +; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_min_f32_e32 v11, v13, v12 +; GFX11-NEXT: v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v3, v3, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9 +; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_dual_min_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x3020706 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e32 v1, v4, v1 +; 
GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fmul <4 x bfloat> %a, %b - ret <4 x bfloat> %op + %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) + ret <8 x bfloat> %op } -define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { -; GCN-LABEL: v_fdiv_bf16: +define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { +; GCN-LABEL: v_minnum_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_min_f32_e32 v14, v14, v30 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_min_f32_e32 v13, v13, v29 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_min_f32_e32 v12, v12, v28 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_min_f32_e32 v11, v11, v27 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_min_f32_e32 v10, v10, v26 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_min_f32_e32 v9, v9, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_min_f32_e32 v8, v8, v24 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_min_f32_e32 v7, v7, v23 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_min_f32_e32 v6, v6, v22 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_min_f32_e32 v5, v5, v21 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_min_f32_e32 v4, v4, v20 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GCN-NEXT: v_rcp_f32_e32 v3, v2 -; GCN-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GCN-NEXT: v_fma_f32 v3, v4, v3, v3 -; GCN-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; 
GCN-NEXT: v_mul_f32_e32 v5, v4, v3 -; GCN-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GCN-NEXT: v_fma_f32 v5, v6, v3, v5 -; GCN-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GCN-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GCN-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_min_f32_e32 v3, v3, v19 +; GCN-NEXT: v_min_f32_e32 v2, v2, v18 +; GCN-NEXT: v_min_f32_e32 v1, v1, v17 +; GCN-NEXT: v_min_f32_e32 v0, v0, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_min_f32_e32 v15, v15, v16 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fdiv_bf16: +; GFX7-LABEL: v_minnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; 
GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX7-NEXT: v_rcp_f32_e32 v3, v2 -; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX7-NEXT: v_fma_f32 v3, v4, v3, v3 -; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5 -; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fdiv_bf16: +; GFX8-LABEL: v_minnum_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_div_scale_f32 v2, 
s[4:5], v1, v1, v0 -; GFX8-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX8-NEXT: v_rcp_f32_e32 v4, v2 -; GFX8-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX8-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX8-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX8-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX8-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX8-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX8-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; 
GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_bf16: +; GFX9-LABEL: v_minnum_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX9-NEXT: v_rcp_f32_e32 v4, v2 -; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX9-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_max_f32_e32 v16, v16, v16 +; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 +; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 +; GFX9-NEXT: v_min_f32_e32 v17, v18, v17 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5 +; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 +; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 +; GFX9-NEXT: v_min_f32_e32 v18, v19, v18 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 +; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 +; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 +; GFX9-NEXT: v_min_f32_e32 v19, v20, v19 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 +; GFX9-NEXT: v_max_f32_e32 v21, v21, v21 +; GFX9-NEXT: v_min_f32_e32 v20, v21, v20 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 +; GFX9-NEXT: v_max_f32_e32 v21, v21, v21 +; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 +; GFX9-NEXT: v_min_f32_e32 v21, v22, v21 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 +; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 +; GFX9-NEXT: v_min_f32_e32 v22, v23, v22 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 
16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 +; GFX9-NEXT: v_max_f32_e32 v24, v24, v24 +; GFX9-NEXT: v_max_f32_e32 v15, v15, v15 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX9-NEXT: v_max_f32_e32 v14, v14, v14 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX9-NEXT: v_max_f32_e32 v13, v13, v13 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v23, v24, v23 +; GFX9-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX9-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fdiv_bf16: +; GFX10-LABEL: v_minnum_v16bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0 -; GFX10-NEXT: v_rcp_f32_e32 v3, v2 -; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX10-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX10-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX10-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX10-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 +; GFX10-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 +; GFX10-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX10-NEXT: v_max_f32_e32 v17, v18, v18 +; GFX10-NEXT: v_max_f32_e32 v18, v19, v19 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v0 +; GFX10-NEXT: v_min_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_max_f32_e32 v18, v19, v19 
+; GFX10-NEXT: v_max_f32_e32 v19, v20, v20 +; GFX10-NEXT: v_max_f32_e32 v20, v21, v21 +; GFX10-NEXT: v_max_f32_e32 v21, v22, v22 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX10-NEXT: v_min_f32_e32 v18, v19, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10-NEXT: v_min_f32_e32 v19, v21, v20 +; GFX10-NEXT: v_max_f32_e32 v20, v22, v22 +; GFX10-NEXT: v_max_f32_e32 v21, v23, v23 +; GFX10-NEXT: v_max_f32_e32 v22, v24, v24 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v23, v23, v23 +; GFX10-NEXT: v_max_f32_e32 v24, v24, v24 +; GFX10-NEXT: v_max_f32_e32 v25, v25, v25 +; GFX10-NEXT: v_max_f32_e32 v26, v26, v26 +; GFX10-NEXT: v_max_f32_e32 v27, v27, v27 +; GFX10-NEXT: v_max_f32_e32 v15, v15, v15 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX10-NEXT: v_max_f32_e32 v14, v14, v14 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f32_e32 v13, v13, v13 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v12, v12, v12 +; GFX10-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_min_f32_e32 v20, v21, v20 +; GFX10-NEXT: v_min_f32_e32 v21, v23, v22 +; GFX10-NEXT: v_min_f32_e32 v22, v25, v24 +; GFX10-NEXT: v_min_f32_e32 v23, v27, v26 +; GFX10-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 +; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 +; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 +; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 +; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 +; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 +; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 +; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fdiv_bf16: +; GFX11-LABEL: v_minnum_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11-NEXT: v_rcp_f32_e32 v3, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 -; GFX11-NEXT: v_mul_f32_e32 v5, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f32 v6, -v2, v5, v4 -; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_fma_f32 v2, -v2, v5, v4 -; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14 +; GFX11-NEXT: v_dual_max_f32 v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1 +; GFX11-NEXT: v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8 +; GFX11-NEXT: v_dual_min_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21 +; GFX11-NEXT: v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_dual_min_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_min_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22 +; GFX11-NEXT: v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23 +; GFX11-NEXT: v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_min_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15 +; GFX11-NEXT: v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0 +; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24 +; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_dual_min_f32 v21, v23, v22 :: v_dual_min_f32 v22, v25, v24 +; GFX11-NEXT: v_dual_min_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12 +; 
GFX11-NEXT: v_dual_min_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, v5 +; GFX11-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_dual_min_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4 +; GFX11-NEXT: v_dual_max_f32 v12, v12, v12 :: v_dual_min_f32 v5, v5, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_min_f32 v0, v0, v8 :: v_dual_min_f32 v3, v3, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX11-NEXT: v_dual_min_f32 v4, v4, v12 :: v_dual_min_f32 v1, v1, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 +; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 +; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 +; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 +; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 +; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fdiv bfloat %a, %b - ret bfloat %op + %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) + ret <16 x bfloat> %op } -declare bfloat @llvm.fabs.bf16(bfloat) - -define bfloat @v_fabs_bf16(bfloat %a) { -; GCN-LABEL: v_fabs_bf16: +define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { +; GCN-LABEL: v_minnum_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_min_f32_e32 v31, v32, v31 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GCN-NEXT: v_min_f32_e32 v30, v30, v32 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_min_f32_e32 v29, v29, v32 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: 
v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_min_f32_e32 v28, v28, v32 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_min_f32_e32 v27, v27, v32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_min_f32_e32 v26, v26, v32 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_min_f32_e32 v25, v25, v32 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 +; GCN-NEXT: v_min_f32_e32 v24, v24, v32 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_min_f32_e32 v23, v23, v32 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; GCN-NEXT: v_min_f32_e32 v22, v22, v32 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 +; GCN-NEXT: v_min_f32_e32 v21, v21, v32 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 +; GCN-NEXT: v_min_f32_e32 v20, v20, v32 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_min_f32_e32 v19, v19, v32 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; GCN-NEXT: v_min_f32_e32 v18, v18, v32 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; 
GCN-NEXT: v_min_f32_e32 v17, v17, v32 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_min_f32_e32 v16, v16, v32 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_min_f32_e32 v15, v15, v32 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_min_f32_e32 v14, v14, v32 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_min_f32_e32 v13, v13, v32 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_min_f32_e32 v12, v12, v32 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_min_f32_e32 v11, v11, v32 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_min_f32_e32 v10, v10, v32 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_min_f32_e32 v9, v9, v32 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_min_f32_e32 v8, v8, v32 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_min_f32_e32 v7, v7, v32 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_min_f32_e32 v6, v6, v32 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 
+; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_min_f32_e32 v5, v5, v32 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_min_f32_e32 v4, v4, v32 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_min_f32_e32 v3, v3, v32 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_min_f32_e32 v2, v2, v32 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_min_f32_e32 v1, v1, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_min_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fabs_bf16: +; GFX7-LABEL: v_minnum_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; 
GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v30, v30, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: 
v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v29, v29, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v28, v28, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v27, v27, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v26, v26, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v25, v25, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v24, v24, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v23, v23, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v22, v22, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v21, v21, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v20, v20, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v19, v19, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v18, v18, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v17, v17, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; GFX7-NEXT: 
v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v16, v16, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v15, v15, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v14, v14, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v13, v13, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v12, v12, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v11, v11, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v10, v10, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v9, v9, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v8, v8, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v32 +; GFX7-NEXT: 
buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v32 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fabs_bf16: +; GFX8-LABEL: v_minnum_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX8-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX8-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX8-NEXT: v_min_f32_e32 v30, v32, v30 +; GFX8-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX8-NEXT: v_min_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 +; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_min_f32_e32 v28, v32, v28 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4 
+; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_min_f32_e32 v27, v27, v33 +; GFX8-NEXT: v_min_f32_e32 v15, v15, v32 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX8-NEXT: v_min_f32_e32 v32, v33, v32 +; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_min_f32_e32 v26, v33, v26 +; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX8-NEXT: v_min_f32_e32 v25, v33, v25 +; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX8-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX8-NEXT: v_min_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX8-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_min_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 
16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_min_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_min_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX8-NEXT: v_perm_b32 v10, v10, v32, s4 +; GFX8-NEXT: v_perm_b32 v15, v15, v27, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fabs_bf16: +; GFX9-LABEL: v_minnum_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 +; GFX9-NEXT: v_max_f32_e32 v31, v31, v31 +; GFX9-NEXT: v_max_f32_e32 v32, v32, v32 +; GFX9-NEXT: v_max_f32_e32 v33, v33, v33 +; GFX9-NEXT: v_max_f32_e32 v34, v34, v34 +; GFX9-NEXT: v_max_f32_e32 v36, v36, v36 +; GFX9-NEXT: v_max_f32_e32 v37, v37, v37 +; GFX9-NEXT: v_max_f32_e32 v50, v50, v50 +; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_and_b32_e32 v38, 0xffff0000, v27 +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v21 +; GFX9-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX9-NEXT: v_min_f32_e32 v32, v34, v33 +; GFX9-NEXT: v_min_f32_e32 v33, v37, v36 +; GFX9-NEXT: v_min_f32_e32 v37, v51, v50 +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v5 +; GFX9-NEXT: v_max_f32_e32 v38, v38, v38 +; GFX9-NEXT: v_max_f32_e32 v39, v39, v39 +; GFX9-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX9-NEXT: v_max_f32_e32 v53, v53, v53 +; GFX9-NEXT: v_max_f32_e32 v50, v43, v43 +; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 +; GFX9-NEXT: v_min_f32_e32 v34, v39, v38 +; GFX9-NEXT: v_min_f32_e32 v38, v53, v52 +; GFX9-NEXT: v_min_f32_e32 v50, v51, v50 +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v4 +; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 +; GFX9-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23 +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 +; GFX9-NEXT: v_min_f32_e32 v51, v52, v51 +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v19 +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 +; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 +; GFX9-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX9-NEXT: v_max_f32_e32 v53, v53, v53 +; GFX9-NEXT: v_min_f32_e32 v39, v55, v54 +; GFX9-NEXT: v_min_f32_e32 v52, v53, v52 +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v2 +; GFX9-NEXT: v_max_f32_e32 v53, v53, v53 +; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 +; GFX9-NEXT: v_min_f32_e32 v53, v54, v53 +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v48, v48, v48 +; GFX9-NEXT: v_max_f32_e32 v49, v49, v49 +; GFX9-NEXT: v_max_f32_e32 v40, v40, v40 +; GFX9-NEXT: v_max_f32_e32 v41, v41, v41 +; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 +; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 +; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v15 +; GFX9-NEXT: v_min_f32_e32 v36, v49, v48 +; GFX9-NEXT: v_min_f32_e32 v48, v41, v40 +; GFX9-NEXT: v_min_f32_e32 v54, v55, v54 +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v0 +; GFX9-NEXT: v_max_f32_e32 v42, v42, v42 +; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 +; GFX9-NEXT: v_max_f32_e32 v40, v40, v40 +; GFX9-NEXT: v_min_f32_e32 v55, v40, v55 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v35 +; GFX9-NEXT: v_max_f32_e32 v49, v49, v49 +; GFX9-NEXT: v_min_f32_e32 v49, v42, v49 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v35, v35, v35 +; GFX9-NEXT: v_max_f32_e32 v15, v15, v15 +; GFX9-NEXT: v_max_f32_e32 v30, v30, v30 +; GFX9-NEXT: v_max_f32_e32 v14, v14, v14 +; GFX9-NEXT: v_max_f32_e32 v29, v29, v29 +; GFX9-NEXT: v_max_f32_e32 v13, v13, v13 +; GFX9-NEXT: v_max_f32_e32 v28, v28, v28 +; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 +; GFX9-NEXT: v_max_f32_e32 v27, v27, v27 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX9-NEXT: v_max_f32_e32 v26, v26, v26 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX9-NEXT: v_max_f32_e32 v25, v25, v25 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX9-NEXT: v_max_f32_e32 v24, v24, v24 +; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 +; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX9-NEXT: v_max_f32_e32 v21, v21, v21 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v16, v16, v16 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v15, v15, v35 +; GFX9-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_perm_b32 v0, v0, v55, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v54, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v53, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v52, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v51, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v50, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v48, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v39, s4 +; GFX9-NEXT: v_perm_b32 v8, v8, v38, s4 +; GFX9-NEXT: v_perm_b32 v9, v9, v37, s4 +; GFX9-NEXT: v_perm_b32 v10, v10, v36, s4 +; 
GFX9-NEXT: v_perm_b32 v11, v11, v34, s4 +; GFX9-NEXT: v_perm_b32 v12, v12, v33, s4 +; GFX9-NEXT: v_perm_b32 v13, v13, v32, s4 +; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX9-NEXT: v_perm_b32 v15, v15, v49, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fabs_bf16: +; GFX10-LABEL: v_minnum_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 +; GFX10-NEXT: v_max_f32_e32 v53, v53, v53 +; GFX10-NEXT: v_max_f32_e32 v54, v54, v54 +; GFX10-NEXT: v_max_f32_e32 v55, v55, v55 +; GFX10-NEXT: v_max_f32_e32 v64, v64, v64 +; GFX10-NEXT: v_max_f32_e32 v65, v65, v65 +; GFX10-NEXT: v_max_f32_e32 v66, v66, v66 +; GFX10-NEXT: v_max_f32_e32 v67, v67, v67 +; GFX10-NEXT: v_max_f32_e32 v68, v68, v68 +; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX10-NEXT: v_min_f32_e32 v53, v54, v53 +; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 +; GFX10-NEXT: v_min_f32_e32 v55, v64, v55 +; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1 +; GFX10-NEXT: v_min_f32_e32 v65, v66, v65 +; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16 +; GFX10-NEXT: v_min_f32_e32 v67, v68, v67 +; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_max_f32_e32 v32, v32, v32 +; GFX10-NEXT: v_max_f32_e32 v34, v34, v34 +; GFX10-NEXT: v_max_f32_e32 v35, v35, v35 +; GFX10-NEXT: v_max_f32_e32 v36, v36, v36 +; GFX10-NEXT: v_max_f32_e32 v37, v37, v37 +; GFX10-NEXT: v_max_f32_e32 v38, v38, v38 +; GFX10-NEXT: v_max_f32_e32 v39, v39, v39 +; GFX10-NEXT: v_max_f32_e32 v48, v48, v48 +; GFX10-NEXT: v_max_f32_e32 v49, v49, v49 +; GFX10-NEXT: v_max_f32_e32 v50, v50, v50 +; GFX10-NEXT: v_max_f32_e32 v51, v51, v51 +; GFX10-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX10-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; GFX10-NEXT: v_min_f32_e32 v32, v34, v32 +; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20 +; GFX10-NEXT: v_min_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; GFX10-NEXT: v_min_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19 +; GFX10-NEXT: v_min_f32_e32 v39, v48, 
v39 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 +; GFX10-NEXT: v_min_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18 +; GFX10-NEXT: v_min_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX10-NEXT: v_max_f32_e32 v33, v33, v33 +; GFX10-NEXT: v_max_f32_e32 v34, v34, v34 +; GFX10-NEXT: v_max_f32_e32 v36, v36, v36 +; GFX10-NEXT: v_max_f32_e32 v38, v38, v38 +; GFX10-NEXT: v_max_f32_e32 v48, v48, v48 +; GFX10-NEXT: v_max_f32_e32 v50, v50, v50 +; GFX10-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX10-NEXT: v_max_f32_e32 v54, v54, v54 +; GFX10-NEXT: v_max_f32_e32 v64, v64, v64 +; GFX10-NEXT: v_max_f32_e32 v66, v66, v66 +; GFX10-NEXT: v_max_f32_e32 v68, v68, v68 +; GFX10-NEXT: v_max_f32_e32 v15, v15, v15 +; GFX10-NEXT: v_max_f32_e32 v30, v30, v30 +; GFX10-NEXT: v_max_f32_e32 v14, v14, v14 +; GFX10-NEXT: v_max_f32_e32 v29, v29, v29 +; GFX10-NEXT: v_max_f32_e32 v13, v13, v13 +; GFX10-NEXT: v_max_f32_e32 v28, v28, v28 +; GFX10-NEXT: v_max_f32_e32 v12, v12, v12 +; GFX10-NEXT: v_max_f32_e32 v27, v27, v27 +; GFX10-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX10-NEXT: v_max_f32_e32 v26, v26, v26 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX10-NEXT: v_max_f32_e32 v25, v25, v25 +; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX10-NEXT: v_max_f32_e32 v24, v24, v24 +; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 +; GFX10-NEXT: v_max_f32_e32 v23, v23, v23 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX10-NEXT: v_max_f32_e32 v22, v22, v22 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f32_e32 v21, v21, v21 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v20, v20, v20 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v19, v19, v19 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v18, v18, v18 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_min_f32_e32 v34, v36, v34 +; GFX10-NEXT: v_min_f32_e32 v36, v48, v38 +; GFX10-NEXT: v_min_f32_e32 v38, v52, v50 +; GFX10-NEXT: v_min_f32_e32 v48, v64, v54 +; GFX10-NEXT: v_min_f32_e32 v50, v68, v66 +; GFX10-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_min_f32_e32 v11, v11, v27 +; 
GFX10-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX10-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706 +; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706 +; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706 +; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706 +; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706 +; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706 +; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706 +; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706 +; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706 +; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706 +; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706 +; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706 +; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706 +; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706 +; GFX10-NEXT: v_perm_b32 v14, v14, v32, 0x3020706 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 +; GFX10-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX10-NEXT: v_min_f32_e32 v16, v33, v16 +; GFX10-NEXT: v_min_f32_e32 v15, v15, v17 +; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fabs_bf16: +; GFX11-LABEL: v_minnum_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX11-NEXT: v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15 +; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 +; GFX11-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 +; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 +; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17 +; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37 +; GFX11-NEXT: v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_max_f32_e32 v36, v36, v36 +; GFX11-NEXT: v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 +; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 +; GFX11-NEXT: v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2 +; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48 +; GFX11-NEXT: v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50 +; GFX11-NEXT: v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53 +; GFX11-NEXT: v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66 +; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26 +; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10 +; GFX11-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22 +; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6 +; GFX11-NEXT: v_dual_min_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16 +; GFX11-NEXT: v_dual_min_f32 v34, v36, v35 :: v_dual_min_f32 v35, v38, v37 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69 +; GFX11-NEXT: v_dual_min_f32 v36, v48, v39 :: v_dual_min_f32 v37, v50, v49 +; GFX11-NEXT: v_min_f32_e32 v39, v54, v53 +; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_min_f32 v1, v1, v17 +; GFX11-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX11-NEXT: v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55 +; GFX11-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX11-NEXT: v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71 +; GFX11-NEXT: v_max_f32_e32 v68, v68, v68 +; GFX11-NEXT: v_max_f32_e32 v80, v80, v80 +; GFX11-NEXT: v_max_f32_e32 v82, v82, v82 +; GFX11-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85 +; GFX11-NEXT: v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84 +; GFX11-NEXT: v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30 +; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14 +; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28 +; GFX11-NEXT: 
v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12 +; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24 +; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8 +; GFX11-NEXT: v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v18, v18, v18 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: v_dual_min_f32 v38, v52, v51 :: v_dual_min_f32 v53, v82, v81 +; GFX11-NEXT: v_dual_min_f32 v48, v64, v55 :: v_dual_min_f32 v55, v86, v85 +; GFX11-NEXT: v_dual_min_f32 v49, v66, v65 :: v_dual_min_f32 v50, v68, v67 +; GFX11-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX11-NEXT: v_dual_min_f32 v51, v70, v69 :: v_dual_min_f32 v52, v80, v71 +; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_min_f32 v54, v84, v83 +; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_min_f32 v14, v14, v30 +; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_min_f32 v12, v12, v28 +; GFX11-NEXT: v_dual_min_f32 v7, v7, v23 :: v_dual_min_f32 v8, v8, v24 +; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_min_f32 v4, v4, v20 +; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706 +; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706 +; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706 +; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706 +; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706 +; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706 +; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706 +; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706 +; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706 +; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706 +; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 +; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_min_f32 v2, v2, v18 +; GFX11-NEXT: v_max_f32_e32 v16, v16, v16 +; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_min_f32_e32 v15, v15, v17 +; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v16, v32, v16 +; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = call bfloat @llvm.fabs.bf16(bfloat %a) - ret bfloat %op + %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) + ret <32 x bfloat> %op } -define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) { -; GCN-LABEL: s_fabs_bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GCN-NEXT: ; return to shader part epilog -; -; GFX7-LABEL: s_fabs_bf16: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GFX7-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fabs_bf16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fabs_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fabs_bf16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GFX10-NEXT: 
; return to shader part epilog -; -; GFX11-LABEL: s_fabs_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_bfe_u32 s0, s0, 0xf0010 -; GFX11-NEXT: ; return to shader part epilog - %op = call bfloat @llvm.fabs.bf16(bfloat %a) - %cast = bitcast bfloat %op to i16 - %zext = zext i16 %cast to i32 - %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) - ret i32 %readlane -} -define bfloat @v_fneg_bf16(bfloat %a) { -; GCN-LABEL: v_fneg_bf16: +declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) +declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>) +declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>) +declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>) +declare <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat>, <8 x bfloat>) +declare <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat>, <16 x bfloat>) +declare <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat>, <32 x bfloat>) + +define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { +; GCN-LABEL: v_maxnum_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fneg_bf16: +; GFX7-LABEL: v_maxnum_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fneg_bf16: +; GFX8-LABEL: v_maxnum_bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fneg_bf16: +; GFX9-LABEL: v_maxnum_bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fneg_bf16: +; GFX10-LABEL: v_maxnum_bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fneg_bf16: +; GFX11-LABEL: v_maxnum_bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = fneg bfloat %a + %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) ret bfloat %op } -declare i32 @llvm.amdgcn.readfirstlane(i32) - -; FIXME: readfirstlane hack for other bugs -define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { -; GCN-LABEL: s_fneg_bf16: -; GCN: ; %bb.0: -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: s_xor_b32 s0, s0, 0x8000 -; GCN-NEXT: ; return to shader part epilog -; -; GFX7-LABEL: s_fneg_bf16: -; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: s_xor_b32 s0, s0, 0x8000 -; GFX7-NEXT: ; return to shader part epilog -; -; GFX8-LABEL: s_fneg_bf16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 -; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog -; -; GFX9-LABEL: s_fneg_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000 -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: s_fneg_bf16: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: v_xor_b32_e64 v0, 0xffff8000, s0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: s_fneg_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e64 v0, 0xffff8000, s0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog - %op = fneg bfloat %a - %cast = bitcast bfloat %op to i16 - %zext = zext i16 %cast to i32 - %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) - ret i32 %readlane -} - -define bfloat @v_fneg_fabs_bf16(bfloat %a) { -; GCN-LABEL: v_fneg_fabs_bf16: +define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { +; GCN-LABEL: v_maxnum_v2bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_max_f32_e32 v1, v1, v3 +; GCN-NEXT: v_max_f32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fneg_fabs_bf16: +; GFX7-LABEL: v_maxnum_v2bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fneg_fabs_bf16: +; GFX8-LABEL: v_maxnum_v2bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fneg_fabs_bf16: +; GFX9-LABEL: v_maxnum_v2bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fneg_fabs_bf16: +; GFX10-LABEL: v_maxnum_v2bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fneg_fabs_bf16: +; GFX11-LABEL: v_maxnum_v2bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %fabs = call bfloat @llvm.fabs.bf16(bfloat %a) - %op = fneg bfloat %fabs - ret bfloat %op + %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) + ret <2 x bfloat> %op } -; FIXME: readfirstlane hack for other bugs -define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { -; GCN-LABEL: s_fneg_fabs_bf16: +define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { +; GCN-LABEL: v_maxnum_v3bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_lshr_b32 s0, s0, 16 -; GCN-NEXT: s_bitset1_b32 s0, 15 -; GCN-NEXT: ; return to shader part epilog +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_max_f32_e32 v2, v2, v5 +; GCN-NEXT: v_max_f32_e32 v1, v1, v4 +; GCN-NEXT: v_max_f32_e32 v0, v0, v3 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: s_fneg_fabs_bf16: +; GFX7-LABEL: v_maxnum_v3bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 -; GFX7-NEXT: s_bitset1_b32 s0, 15 -; GFX7-NEXT: ; return to shader part epilog +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: s_fneg_fabs_bf16: +; GFX8-LABEL: 
v_maxnum_v3bf16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s0, s0, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, 0xffff8000 -; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: ; return to shader part epilog +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: s_fneg_fabs_bf16: +; GFX9-LABEL: v_maxnum_v3bf16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s0, s0, 16 -; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff8000 -; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: s_fneg_fabs_bf16: +; GFX10-LABEL: v_maxnum_v3bf16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: v_or_b32_e64 v0, 0xffff8000, s0 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX10-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10-NEXT: ; return to shader part epilog +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: s_fneg_fabs_bf16: +; GFX11-LABEL: v_maxnum_v3bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_or_b32_e64 v0, 0xffff8000, s0 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v5, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v3 :: v_dual_max_f32 v0, v0, v2 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: v_max_f32_e32 v4, v5, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: ; return to shader part epilog - %fabs = call bfloat @llvm.fabs.bf16(bfloat %a) - %op = fneg bfloat %fabs - %cast = bitcast bfloat %op to i16 - %zext = zext i16 %cast to i32 - %readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) - ret i32 %readlane +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) + ret <3 x bfloat> %op } -declare bfloat @llvm.minnum.bf16(bfloat, bfloat) -declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>) - -define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { -; GCN-LABEL: v_minnum_bf16: +define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { +; GCN-LABEL: v_maxnum_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_min_f32_e32 v0, v0, v1 +; GCN-NEXT: v_max_f32_e32 v3, v3, v7 +; GCN-NEXT: v_max_f32_e32 v2, v2, v6 +; GCN-NEXT: v_max_f32_e32 v1, v1, v5 +; GCN-NEXT: v_max_f32_e32 v0, v0, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_minnum_bf16: +; GFX7-LABEL: v_maxnum_v4bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; 
GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_minnum_bf16: +; GFX8-LABEL: v_maxnum_v4bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v4, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_bf16: +; GFX9-LABEL: v_maxnum_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: 
v_minnum_bf16: +; GFX10-LABEL: v_maxnum_v4bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX10-NEXT: v_max_f32_e32 v3, v5, v4 +; GFX10-NEXT: v_max_f32_e32 v4, v7, v6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minnum_bf16: +; GFX11-LABEL: v_maxnum_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_and_b32 v6, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_dual_max_f32 v6, v7, v7 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_max_f32_e32 v4, v6, v5 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x3020706 +; GFX11-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) - ret bfloat %op + %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) + ret <4 x bfloat> %op } -define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { -; GCN-LABEL: v_minnum_v2bf16: +define <8 x bfloat> @v_maxnum_v8bf16(<8 
x bfloat> %a, <8 x bfloat> %b) { +; GCN-LABEL: v_maxnum_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_min_f32_e32 v1, v1, v3 -; GCN-NEXT: v_min_f32_e32 v0, v0, v2 +; GCN-NEXT: v_max_f32_e32 v7, v7, v15 +; GCN-NEXT: v_max_f32_e32 v6, v6, v14 +; GCN-NEXT: v_max_f32_e32 v5, v5, v13 +; GCN-NEXT: v_max_f32_e32 v4, v4, v12 +; GCN-NEXT: v_max_f32_e32 v3, v3, v11 +; GCN-NEXT: v_max_f32_e32 v2, v2, v10 +; GCN-NEXT: v_max_f32_e32 v1, v1, v9 +; GCN-NEXT: v_max_f32_e32 v0, v0, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_minnum_v2bf16: +; GFX7-LABEL: v_maxnum_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: 
v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_minnum_v2bf16: +; GFX8-LABEL: v_maxnum_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_min_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX8-NEXT: v_perm_b32 
v2, v2, v7, s4 +; GFX8-NEXT: v_perm_b32 v3, v3, v8, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v2bf16: +; GFX9-LABEL: v_maxnum_v8bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX9-NEXT: v_max_f32_e32 v9, v10, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v10, v11, v10 +; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 +; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v0 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_min_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; GFX9-NEXT: v_max_f32_e32 v11, v12, v11 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v8 +; GFX9-NEXT: v_perm_b32 v0, v0, v11, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v10, s4 +; GFX9-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_minnum_v2bf16: +; GFX10-LABEL: v_maxnum_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v2 +; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 +; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX10-NEXT: v_max_f32_e32 v9, v11, v11 +; GFX10-NEXT: v_and_b32_e32 
v11, 0xffff0000, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v9, v9, v10 +; GFX10-NEXT: v_max_f32_e32 v10, v11, v11 +; GFX10-NEXT: v_max_f32_e32 v11, v12, v12 +; GFX10-NEXT: v_max_f32_e32 v12, v13, v13 +; GFX10-NEXT: v_max_f32_e32 v13, v14, v14 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 ; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_min_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v10, v11, v10 +; GFX10-NEXT: v_max_f32_e32 v11, v13, v12 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v9 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v8 +; GFX10-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 +; GFX10-NEXT: v_or_b32_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minnum_v2bf16: +; GFX11-LABEL: v_maxnum_v8bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8 +; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f32_e32 v2, v3, v2 -; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v9, v8 +; GFX11-NEXT: v_max_f32_e32 v9, v11, v10 +; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX11-NEXT: v_max_f32_e32 v10, v12, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v0 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_lshlrev_b32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_max_f32_e32 v10, v11, v10 +; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_and_b32 v12, 0xffff0000, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v12, v12, v12 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v3, v3, v3 +; GFX11-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_max_f32_e32 v11, v13, v12 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v3, v3, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX11-NEXT: v_dual_max_f32 v5, v6, v6 :: v_dual_and_b32 v4, 0xffff0000, v9 +; GFX11-NEXT: v_perm_b32 v0, v0, v11, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v2, v2, v5 :: v_dual_and_b32 v5, 0xffff0000, v8 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_perm_b32 v2, v2, v10, 0x3020706 +; GFX11-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) - ret <2 x bfloat> %op + %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) + ret <8 x bfloat> %op } -declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) -declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>) - -define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { -; GCN-LABEL: v_maxnum_bf16: +define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { +; GCN-LABEL: v_maxnum_v16bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_max_f32_e32 v14, v14, v30 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_max_f32_e32 v13, v13, v29 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_max_f32_e32 v12, v12, v28 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_max_f32_e32 v11, v11, v27 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_max_f32_e32 v10, v10, v26 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; 
GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_max_f32_e32 v9, v9, v25 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_max_f32_e32 v8, v8, v24 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_max_f32_e32 v7, v7, v23 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_max_f32_e32 v6, v6, v22 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_max_f32_e32 v5, v5, v21 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_max_f32_e32 v4, v4, v20 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_max_f32_e32 v3, v3, v19 +; GCN-NEXT: v_max_f32_e32 v2, v2, v18 +; GCN-NEXT: v_max_f32_e32 v1, v1, v17 +; GCN-NEXT: v_max_f32_e32 v0, v0, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v20 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_max_f32_e32 v15, v15, v16 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_maxnum_bf16: +; GFX7-LABEL: v_maxnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], 
s32 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: 
s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_maxnum_bf16: +; GFX8-LABEL: v_maxnum_v16bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX8-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX8-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; 
GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX8-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX8-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX8-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX8-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX8-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX8-NEXT: v_perm_b32 v7, v7, v16, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_bf16: +; GFX9-LABEL: v_maxnum_v16bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX9-NEXT: v_max_f32_e32 v16, v16, v16 +; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 +; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 +; GFX9-NEXT: v_max_f32_e32 v17, v18, v17 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v5 +; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 +; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 +; GFX9-NEXT: v_max_f32_e32 v18, v19, v18 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v4 +; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 +; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 +; GFX9-NEXT: v_max_f32_e32 v19, v20, v19 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 +; GFX9-NEXT: v_max_f32_e32 v21, v21, v21 +; GFX9-NEXT: v_max_f32_e32 v20, v21, v20 +; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v2 +; GFX9-NEXT: v_max_f32_e32 v21, v21, v21 +; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 +; GFX9-NEXT: v_max_f32_e32 v21, v22, v21 +; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 +; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 +; GFX9-NEXT: v_max_f32_e32 v22, v23, v22 +; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, 
v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 +; GFX9-NEXT: v_max_f32_e32 v24, v24, v24 +; GFX9-NEXT: v_max_f32_e32 v15, v15, v15 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX9-NEXT: v_max_f32_e32 v14, v14, v14 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX9-NEXT: v_max_f32_e32 v13, v13, v13 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_max_f32_e32 v23, v24, v23 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: v_perm_b32 v0, v0, v23, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v22, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v21, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v19, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v18, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v17, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_maxnum_bf16: +; GFX10-LABEL: v_maxnum_v16bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 +; GFX10-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 +; GFX10-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX10-NEXT: v_max_f32_e32 v17, v18, v18 +; GFX10-NEXT: v_max_f32_e32 v18, v19, v19 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v1 +; GFX10-NEXT: v_and_b32_e32 v26, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v0 +; GFX10-NEXT: v_max_f32_e32 v17, v18, v17 +; GFX10-NEXT: v_max_f32_e32 v18, v19, v19 +; GFX10-NEXT: v_max_f32_e32 v19, v20, v20 +; GFX10-NEXT: v_max_f32_e32 v20, v21, v21 +; GFX10-NEXT: v_max_f32_e32 v21, v22, v22 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX10-NEXT: v_max_f32_e32 v18, v19, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10-NEXT: v_max_f32_e32 v19, v21, v20 +; GFX10-NEXT: v_max_f32_e32 v20, v22, v22 +; GFX10-NEXT: v_max_f32_e32 v21, v23, v23 +; GFX10-NEXT: v_max_f32_e32 v22, v24, v24 +; GFX10-NEXT: 
v_and_b32_e32 v23, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v23, v23, v23 +; GFX10-NEXT: v_max_f32_e32 v24, v24, v24 +; GFX10-NEXT: v_max_f32_e32 v25, v25, v25 +; GFX10-NEXT: v_max_f32_e32 v26, v26, v26 +; GFX10-NEXT: v_max_f32_e32 v27, v27, v27 +; GFX10-NEXT: v_max_f32_e32 v15, v15, v15 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX10-NEXT: v_max_f32_e32 v14, v14, v14 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f32_e32 v13, v13, v13 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v12, v12, v12 +; GFX10-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v20, v21, v20 +; GFX10-NEXT: v_max_f32_e32 v21, v23, v22 +; GFX10-NEXT: v_max_f32_e32 v22, v25, v24 +; GFX10-NEXT: v_max_f32_e32 v23, v27, v26 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX10-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 +; GFX10-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 +; GFX10-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 +; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 +; GFX10-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 +; GFX10-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 +; GFX10-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 +; GFX10-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maxnum_bf16: +; GFX11-LABEL: v_maxnum_v16bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_and_b32 v18, 0xffff0000, v14 +; GFX11-NEXT: v_dual_max_f32 
v16, v16, v16 :: v_dual_and_b32 v19, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v3 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v25, 0xffff0000, v1 +; GFX11-NEXT: v_dual_max_f32 v17, v18, v18 :: v_dual_max_f32 v18, v19, v19 +; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v13 +; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_and_b32 v26, 0xffff0000, v8 +; GFX11-NEXT: v_dual_max_f32 v17, v18, v17 :: v_dual_max_f32 v18, v19, v19 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v19, v20, v20 :: v_dual_max_f32 v20, v21, v21 +; GFX11-NEXT: v_dual_max_f32 v21, v22, v22 :: v_dual_and_b32 v22, 0xffff0000, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_dual_max_f32 v18, v19, v18 :: v_dual_lshlrev_b32 v7, 16, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v19, v21, v20 :: v_dual_max_f32 v20, v22, v22 +; GFX11-NEXT: v_dual_max_f32 v14, v14, v14 :: v_dual_max_f32 v21, v23, v23 +; GFX11-NEXT: v_dual_max_f32 v22, v24, v24 :: v_dual_lshlrev_b32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v20, v21, v20 :: v_dual_max_f32 v15, v15, v15 +; GFX11-NEXT: v_dual_max_f32 v26, v26, v26 :: v_dual_and_b32 v27, 0xffff0000, v0 +; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24 +; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v6, v6, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_lshlrev_b32 v10, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_dual_max_f32 v21, v23, v22 :: v_dual_max_f32 v22, v25, v24 +; GFX11-NEXT: v_dual_max_f32 v23, v27, v26 :: v_dual_lshlrev_b32 v12, 16, v12 +; GFX11-NEXT: v_dual_max_f32 v6, v6, v14 :: v_dual_max_f32 v5, v5, v5 +; GFX11-NEXT: v_dual_max_f32 v10, v10, v10 :: v_dual_max_f32 v11, v11, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_dual_max_f32 v7, v7, v15 :: v_dual_lshlrev_b32 v4, 16, v4 +; GFX11-NEXT: v_dual_max_f32 v12, v12, v12 :: v_dual_max_f32 v5, v5, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v0, v0, v8 :: v_dual_max_f32 v3, v3, v11 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX11-NEXT: v_dual_max_f32 v4, v4, v12 :: v_dual_max_f32 v1, v1, v9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v0, v0, v23, 0x3020706 +; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v2, v2, v21, 0x3020706 +; GFX11-NEXT: v_perm_b32 v4, v4, v19, 0x3020706 +; GFX11-NEXT: v_perm_b32 v1, v1, v22, 0x3020706 +; GFX11-NEXT: v_perm_b32 v5, v5, v18, 0x3020706 +; GFX11-NEXT: v_perm_b32 v6, v6, v17, 0x3020706 +; GFX11-NEXT: v_perm_b32 v7, v7, v16, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) - ret bfloat %op + %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) + ret <16 x bfloat> %op } -define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { -; GCN-LABEL: v_maxnum_v2bf16: +define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { +; GCN-LABEL: v_maxnum_v32bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 +; GCN-NEXT: v_max_f32_e32 v31, v32, v31 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 +; GCN-NEXT: v_max_f32_e32 v30, v30, v32 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 +; GCN-NEXT: v_max_f32_e32 v29, v29, v32 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 +; GCN-NEXT: v_max_f32_e32 v28, v28, v32 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; GCN-NEXT: v_max_f32_e32 v27, v27, v32 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 +; GCN-NEXT: v_max_f32_e32 v26, v26, v32 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 +; GCN-NEXT: v_max_f32_e32 v25, v25, v32 
+; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 +; GCN-NEXT: v_max_f32_e32 v24, v24, v32 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; GCN-NEXT: v_max_f32_e32 v23, v23, v32 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 +; GCN-NEXT: v_max_f32_e32 v22, v22, v32 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 +; GCN-NEXT: v_max_f32_e32 v21, v21, v32 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 +; GCN-NEXT: v_max_f32_e32 v20, v20, v32 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 +; GCN-NEXT: v_max_f32_e32 v19, v19, v32 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; GCN-NEXT: v_max_f32_e32 v18, v18, v32 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; GCN-NEXT: v_max_f32_e32 v17, v17, v32 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GCN-NEXT: v_max_f32_e32 v16, v16, v32 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_max_f32_e32 v15, v15, v32 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_max_f32_e32 v14, v14, v32 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_max_f32_e32 v13, v13, v32 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_max_f32_e32 v12, v12, v32 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_max_f32_e32 v11, v11, v32 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_max_f32_e32 v10, v10, v32 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_max_f32_e32 v9, v9, v32 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_max_f32_e32 v8, v8, v32 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_max_f32_e32 v7, v7, v32 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_max_f32_e32 v6, v6, v32 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_max_f32_e32 v5, v5, v32 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_max_f32_e32 v4, v4, v32 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_max_f32_e32 v3, v3, v32 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: 
v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_max_f32_e32 v1, v1, v3 -; GCN-NEXT: v_max_f32_e32 v0, v0, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_max_f32_e32 v2, v2, v32 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_max_f32_e32 v1, v1, v32 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_max_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_maxnum_v2bf16: +; GFX7-LABEL: v_maxnum_v32bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: 
v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v30, v30, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v29, v29, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v28, v28, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 
0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v27, v27, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v26, v26, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v25, v25, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v24, v24, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v23, v23, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v22, v22, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v21, v21, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v20, v20, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v19, v19, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v18, v18, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v17, v17, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v16, v16, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v15, v15, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GFX7-NEXT: v_and_b32_e32 v15, 
0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v14, v14, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v13, v13, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v12, v12, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v11, v11, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v10, v10, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 
offset:8 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v32 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_maxnum_v2bf16: +; GFX8-LABEL: v_maxnum_v32bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX8-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX8-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX8-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX8-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX8-NEXT: v_max_f32_e32 v30, v32, v30 +; GFX8-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX8-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX8-NEXT: v_max_f32_e32 v29, v32, v29 +; GFX8-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v11 +; GFX8-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_max_f32_e32 v28, v32, v28 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX8-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX8-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX8-NEXT: s_mov_b32 s4, 0x3020706 +; GFX8-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX8-NEXT: v_perm_b32 v12, v12, v29, s4 +; GFX8-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX8-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v32 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_max_f32_e32 v27, v27, v33 +; GFX8-NEXT: v_max_f32_e32 v15, v15, v32 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v10 +; GFX8-NEXT: 
v_lshlrev_b32_e32 v26, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX8-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX8-NEXT: v_max_f32_e32 v32, v33, v32 +; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX8-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX8-NEXT: v_max_f32_e32 v26, v33, v26 +; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX8-NEXT: v_max_f32_e32 v25, v33, v25 +; GFX8-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX8-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX8-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX8-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX8-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX8-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX8-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX8-NEXT: v_max_f32_e32 v21, v33, v21 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v3 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX8-NEXT: v_max_f32_e32 v20, v33, v20 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; 
GFX8-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_max_f32_e32 v19, v33, v19 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v18, v33, v18 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX8-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX8-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX8-NEXT: v_max_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX8-NEXT: s_mov_b32 s4, 0x3020706 -; GFX8-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX8-NEXT: v_max_f32_e32 v17, v33, v17 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX8-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX8-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX8-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX8-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX8-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX8-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX8-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX8-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX8-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX8-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX8-NEXT: v_perm_b32 v10, v10, v32, s4 +; GFX8-NEXT: v_perm_b32 v15, v15, v27, s4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v2bf16: +; GFX9-LABEL: v_maxnum_v32bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff0000, v30 +; GFX9-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX9-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX9-NEXT: v_and_b32_e32 v34, 0xffff0000, v13 +; GFX9-NEXT: v_and_b32_e32 v36, 0xffff0000, v28 +; GFX9-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 +; GFX9-NEXT: v_and_b32_e32 v50, 0xffff0000, v25 +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 +; GFX9-NEXT: v_max_f32_e32 v31, v31, v31 +; GFX9-NEXT: v_max_f32_e32 v32, v32, v32 +; GFX9-NEXT: v_max_f32_e32 v33, v33, v33 +; GFX9-NEXT: v_max_f32_e32 v34, v34, v34 +; GFX9-NEXT: v_max_f32_e32 v36, v36, v36 +; GFX9-NEXT: v_max_f32_e32 v37, v37, v37 +; GFX9-NEXT: v_max_f32_e32 v50, v50, v50 +; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 +; GFX9-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 +; GFX9-NEXT: v_and_b32_e32 v43, 0xffff0000, v21 +; GFX9-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX9-NEXT: v_max_f32_e32 v32, v34, v33 +; GFX9-NEXT: v_max_f32_e32 v33, v37, v36 +; GFX9-NEXT: v_max_f32_e32 
v37, v51, v50 +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v5 +; GFX9-NEXT: v_max_f32_e32 v38, v38, v38 +; GFX9-NEXT: v_max_f32_e32 v39, v39, v39 +; GFX9-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX9-NEXT: v_max_f32_e32 v53, v53, v53 +; GFX9-NEXT: v_max_f32_e32 v50, v43, v43 +; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 +; GFX9-NEXT: v_max_f32_e32 v34, v39, v38 +; GFX9-NEXT: v_max_f32_e32 v38, v53, v52 +; GFX9-NEXT: v_max_f32_e32 v50, v51, v50 +; GFX9-NEXT: v_and_b32_e32 v51, 0xffff0000, v20 +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v4 +; GFX9-NEXT: v_max_f32_e32 v51, v51, v51 +; GFX9-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v23 +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 +; GFX9-NEXT: v_max_f32_e32 v51, v52, v51 +; GFX9-NEXT: v_and_b32_e32 v52, 0xffff0000, v19 +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v3 +; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 +; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 +; GFX9-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX9-NEXT: v_max_f32_e32 v53, v53, v53 +; GFX9-NEXT: v_max_f32_e32 v39, v55, v54 +; GFX9-NEXT: v_max_f32_e32 v52, v53, v52 +; GFX9-NEXT: v_and_b32_e32 v53, 0xffff0000, v18 +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v2 +; GFX9-NEXT: v_max_f32_e32 v53, v53, v53 +; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 +; GFX9-NEXT: v_and_b32_e32 v48, 0xffff0000, v26 +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v22 +; GFX9-NEXT: v_and_b32_e32 v41, 0xffff0000, v6 +; GFX9-NEXT: v_max_f32_e32 v53, v54, v53 +; GFX9-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v1 +; GFX9-NEXT: v_max_f32_e32 v48, v48, v48 +; GFX9-NEXT: v_max_f32_e32 v49, v49, v49 +; GFX9-NEXT: v_max_f32_e32 v40, v40, v40 +; GFX9-NEXT: v_max_f32_e32 v41, v41, v41 +; GFX9-NEXT: v_max_f32_e32 v54, v54, v54 +; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 +; GFX9-NEXT: v_and_b32_e32 v42, 0xffff0000, v15 +; GFX9-NEXT: v_max_f32_e32 v36, v49, v48 +; GFX9-NEXT: v_max_f32_e32 v48, v41, v40 +; GFX9-NEXT: v_max_f32_e32 v54, v55, v54 +; GFX9-NEXT: v_and_b32_e32 v55, 0xffff0000, v16 +; GFX9-NEXT: v_and_b32_e32 v40, 0xffff0000, v0 +; GFX9-NEXT: v_max_f32_e32 v42, v42, v42 +; GFX9-NEXT: v_max_f32_e32 v55, v55, v55 +; GFX9-NEXT: v_max_f32_e32 v40, v40, v40 +; GFX9-NEXT: v_max_f32_e32 v55, v40, v55 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_and_b32_e32 v49, 0xffff0000, v35 +; GFX9-NEXT: v_max_f32_e32 v49, v49, v49 +; GFX9-NEXT: v_max_f32_e32 v49, v42, v49 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v35, v35, v35 +; GFX9-NEXT: v_max_f32_e32 v15, v15, v15 +; GFX9-NEXT: v_max_f32_e32 v30, v30, v30 +; GFX9-NEXT: v_max_f32_e32 v14, v14, v14 +; GFX9-NEXT: v_max_f32_e32 v29, v29, v29 +; GFX9-NEXT: v_max_f32_e32 v13, v13, v13 +; GFX9-NEXT: v_max_f32_e32 v28, v28, v28 +; GFX9-NEXT: v_max_f32_e32 v12, v12, v12 +; GFX9-NEXT: v_max_f32_e32 v27, v27, v27 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX9-NEXT: v_max_f32_e32 v26, v26, v26 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX9-NEXT: v_max_f32_e32 v25, v25, v25 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX9-NEXT: v_max_f32_e32 v24, v24, v24 +; GFX9-NEXT: v_max_f32_e32 v8, v8, v8 +; GFX9-NEXT: v_max_f32_e32 v23, v23, v23 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX9-NEXT: v_max_f32_e32 v22, v22, v22 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX9-NEXT: v_max_f32_e32 v21, v21, v21 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX9-NEXT: v_max_f32_e32 v20, v20, v20 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX9-NEXT: v_max_f32_e32 v19, v19, v19 ; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_max_f32_e32 v18, v18, v18 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_max_f32_e32 v17, v17, v17 ; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v16, v16, v16 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_max_f32_e32 v15, v15, v35 +; GFX9-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX9-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 ; GFX9-NEXT: s_mov_b32 s4, 0x3020706 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX9-NEXT: v_perm_b32 v0, v0, v55, s4 +; GFX9-NEXT: v_perm_b32 v1, v1, v54, s4 +; GFX9-NEXT: v_perm_b32 v2, v2, v53, s4 +; GFX9-NEXT: v_perm_b32 v3, v3, v52, s4 +; GFX9-NEXT: v_perm_b32 v4, v4, v51, s4 +; GFX9-NEXT: v_perm_b32 v5, v5, v50, s4 +; GFX9-NEXT: v_perm_b32 v6, v6, v48, s4 +; GFX9-NEXT: v_perm_b32 v7, v7, v39, s4 +; GFX9-NEXT: v_perm_b32 v8, v8, v38, s4 +; GFX9-NEXT: v_perm_b32 v9, v9, v37, s4 +; GFX9-NEXT: v_perm_b32 v10, v10, v36, s4 +; GFX9-NEXT: v_perm_b32 v11, v11, v34, s4 +; GFX9-NEXT: v_perm_b32 v12, v12, v33, s4 +; GFX9-NEXT: v_perm_b32 v13, v13, v32, s4 +; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX9-NEXT: v_perm_b32 v15, v15, v49, s4 +; GFX9-NEXT: s_waitcnt 
vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_maxnum_v2bf16: +; GFX10-LABEL: v_maxnum_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 +; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 +; GFX10-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 +; GFX10-NEXT: v_max_f32_e32 v53, v53, v53 +; GFX10-NEXT: v_max_f32_e32 v54, v54, v54 +; GFX10-NEXT: v_max_f32_e32 v55, v55, v55 +; GFX10-NEXT: v_max_f32_e32 v64, v64, v64 +; GFX10-NEXT: v_max_f32_e32 v65, v65, v65 +; GFX10-NEXT: v_max_f32_e32 v66, v66, v66 +; GFX10-NEXT: v_max_f32_e32 v67, v67, v67 +; GFX10-NEXT: v_max_f32_e32 v68, v68, v68 +; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX10-NEXT: v_max_f32_e32 v53, v54, v53 +; GFX10-NEXT: v_and_b32_e32 v54, 0xffff0000, v17 +; GFX10-NEXT: v_max_f32_e32 v55, v64, v55 +; GFX10-NEXT: v_and_b32_e32 v64, 0xffff0000, v1 +; GFX10-NEXT: v_max_f32_e32 v65, v66, v65 +; GFX10-NEXT: v_and_b32_e32 v66, 0xffff0000, v16 +; GFX10-NEXT: v_max_f32_e32 v67, v68, v67 +; GFX10-NEXT: v_and_b32_e32 v68, 0xffff0000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v32, v32, v32 +; GFX10-NEXT: v_max_f32_e32 v34, v34, v34 +; GFX10-NEXT: v_max_f32_e32 v35, v35, v35 +; GFX10-NEXT: v_max_f32_e32 v36, v36, v36 +; GFX10-NEXT: v_max_f32_e32 v37, v37, v37 +; GFX10-NEXT: v_max_f32_e32 v38, v38, v38 +; GFX10-NEXT: v_max_f32_e32 v39, v39, v39 +; GFX10-NEXT: v_max_f32_e32 v48, v48, v48 +; GFX10-NEXT: v_max_f32_e32 v49, v49, v49 +; GFX10-NEXT: v_max_f32_e32 v50, v50, v50 +; GFX10-NEXT: v_max_f32_e32 v51, v51, v51 +; GFX10-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX10-NEXT: v_max_f32_e32 v17, v17, v17 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX10-NEXT: v_max_f32_e32 v2, v3, v2 -; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; GFX10-NEXT: v_max_f32_e32 v32, v34, v32 +; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v20 +; GFX10-NEXT: v_max_f32_e32 v35, v36, v35 +; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; GFX10-NEXT: v_max_f32_e32 v37, v38, v37 +; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v19 +; GFX10-NEXT: v_max_f32_e32 v39, v48, v39 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v3 +; GFX10-NEXT: 
v_max_f32_e32 v49, v50, v49 +; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v18 +; GFX10-NEXT: v_max_f32_e32 v51, v52, v51 +; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX10-NEXT: v_max_f32_e32 v33, v33, v33 +; GFX10-NEXT: v_max_f32_e32 v34, v34, v34 +; GFX10-NEXT: v_max_f32_e32 v36, v36, v36 +; GFX10-NEXT: v_max_f32_e32 v38, v38, v38 +; GFX10-NEXT: v_max_f32_e32 v48, v48, v48 +; GFX10-NEXT: v_max_f32_e32 v50, v50, v50 +; GFX10-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX10-NEXT: v_max_f32_e32 v54, v54, v54 +; GFX10-NEXT: v_max_f32_e32 v64, v64, v64 +; GFX10-NEXT: v_max_f32_e32 v66, v66, v66 +; GFX10-NEXT: v_max_f32_e32 v68, v68, v68 +; GFX10-NEXT: v_max_f32_e32 v15, v15, v15 +; GFX10-NEXT: v_max_f32_e32 v30, v30, v30 +; GFX10-NEXT: v_max_f32_e32 v14, v14, v14 +; GFX10-NEXT: v_max_f32_e32 v29, v29, v29 +; GFX10-NEXT: v_max_f32_e32 v13, v13, v13 +; GFX10-NEXT: v_max_f32_e32 v28, v28, v28 +; GFX10-NEXT: v_max_f32_e32 v12, v12, v12 +; GFX10-NEXT: v_max_f32_e32 v27, v27, v27 +; GFX10-NEXT: v_max_f32_e32 v11, v11, v11 +; GFX10-NEXT: v_max_f32_e32 v26, v26, v26 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v10 +; GFX10-NEXT: v_max_f32_e32 v25, v25, v25 +; GFX10-NEXT: v_max_f32_e32 v9, v9, v9 +; GFX10-NEXT: v_max_f32_e32 v24, v24, v24 +; GFX10-NEXT: v_max_f32_e32 v8, v8, v8 +; GFX10-NEXT: v_max_f32_e32 v23, v23, v23 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX10-NEXT: v_max_f32_e32 v22, v22, v22 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v6 +; GFX10-NEXT: v_max_f32_e32 v21, v21, v21 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v5 +; GFX10-NEXT: v_max_f32_e32 v20, v20, v20 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX10-NEXT: v_max_f32_e32 v19, v19, v19 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX10-NEXT: v_max_f32_e32 v18, v18, v18 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_max_f32_e32 v34, v36, v34 +; GFX10-NEXT: v_max_f32_e32 v36, v48, v38 +; GFX10-NEXT: v_max_f32_e32 v38, v52, v50 +; GFX10-NEXT: v_max_f32_e32 v48, v64, v54 +; GFX10-NEXT: v_max_f32_e32 v50, v68, v66 +; GFX10-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX10-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX10-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX10-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX10-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX10-NEXT: v_max_f32_e32 v9, v9, 
v25 +; GFX10-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX10-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX10-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX10-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX10-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX10-NEXT: v_perm_b32 v0, v0, v50, 0x3020706 +; GFX10-NEXT: v_perm_b32 v1, v1, v48, 0x3020706 +; GFX10-NEXT: v_perm_b32 v2, v2, v38, 0x3020706 +; GFX10-NEXT: v_perm_b32 v3, v3, v36, 0x3020706 +; GFX10-NEXT: v_perm_b32 v4, v4, v34, 0x3020706 +; GFX10-NEXT: v_perm_b32 v5, v5, v67, 0x3020706 +; GFX10-NEXT: v_perm_b32 v6, v6, v65, 0x3020706 +; GFX10-NEXT: v_perm_b32 v7, v7, v55, 0x3020706 +; GFX10-NEXT: v_perm_b32 v8, v8, v53, 0x3020706 +; GFX10-NEXT: v_perm_b32 v9, v9, v51, 0x3020706 +; GFX10-NEXT: v_perm_b32 v10, v10, v49, 0x3020706 +; GFX10-NEXT: v_perm_b32 v11, v11, v39, 0x3020706 +; GFX10-NEXT: v_perm_b32 v12, v12, v37, 0x3020706 +; GFX10-NEXT: v_perm_b32 v13, v13, v35, 0x3020706 +; GFX10-NEXT: v_perm_b32 v14, v14, v32, 0x3020706 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 +; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX10-NEXT: v_max_f32_e32 v16, v16, v16 +; GFX10-NEXT: v_max_f32_e32 v17, v17, v17 +; GFX10-NEXT: v_max_f32_e32 v16, v33, v16 +; GFX10-NEXT: v_max_f32_e32 v15, v15, v17 +; GFX10-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maxnum_v2bf16: +; GFX11-LABEL: v_maxnum_v32bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX11-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX11-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX11-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX11-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX11-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX11-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX11-NEXT: v_dual_max_f32 v33, v33, v33 :: v_dual_and_b32 v32, 0xffff0000, v15 +; GFX11-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX11-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX11-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX11-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX11-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX11-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX11-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX11-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX11-NEXT: v_and_b32_e32 v65, 0xffff0000, v22 +; GFX11-NEXT: v_and_b32_e32 v67, 0xffff0000, v21 +; GFX11-NEXT: v_and_b32_e32 v66, 0xffff0000, v6 +; GFX11-NEXT: v_and_b32_e32 v71, 0xffff0000, v19 +; GFX11-NEXT: v_and_b32_e32 v68, 0xffff0000, v5 +; GFX11-NEXT: v_and_b32_e32 v83, 0xffff0000, v17 +; GFX11-NEXT: v_and_b32_e32 v86, 0xffff0000, v0 +; GFX11-NEXT: v_and_b32_e32 v85, 0xffff0000, v16 +; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11-NEXT: v_and_b32_e32 v84, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f32_e32 v2, v3, v2 -; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x3020706 +; GFX11-NEXT: v_dual_max_f32 v35, v35, v35 :: v_dual_max_f32 v34, v34, v34 +; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-NEXT: v_dual_max_f32 v38, v38, v38 :: v_dual_max_f32 v37, v37, v37 +; GFX11-NEXT: v_dual_max_f32 v39, v39, v39 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_max_f32_e32 v36, v36, v36 +; GFX11-NEXT: v_dual_max_f32 v65, v65, v65 :: v_dual_and_b32 v64, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v70, 0xffff0000, v4 +; GFX11-NEXT: v_and_b32_e32 v69, 0xffff0000, v20 +; GFX11-NEXT: v_and_b32_e32 v81, 0xffff0000, v18 +; GFX11-NEXT: v_dual_max_f32 v83, v83, v83 :: v_dual_and_b32 v82, 0xffff0000, v2 +; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_lshlrev_b32 v18, 16, v18 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-NEXT: v_dual_max_f32 v49, v49, v49 :: v_dual_max_f32 v48, v48, v48 +; GFX11-NEXT: v_dual_max_f32 v51, v51, v51 :: v_dual_max_f32 v50, v50, v50 +; GFX11-NEXT: v_dual_max_f32 v54, v54, v54 :: v_dual_max_f32 v53, v53, v53 +; GFX11-NEXT: v_dual_max_f32 v67, v67, v67 :: v_dual_max_f32 v66, v66, v66 +; GFX11-NEXT: v_dual_max_f32 v25, v25, v25 :: v_dual_max_f32 v26, v26, v26 +; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v10, v10, v10 +; GFX11-NEXT: v_dual_max_f32 v21, v21, v21 :: v_dual_max_f32 v22, v22, v22 +; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v6, v6, v6 +; GFX11-NEXT: v_dual_max_f32 v33, v34, v33 :: v_dual_max_f32 v16, v16, v16 +; GFX11-NEXT: v_dual_max_f32 v34, v36, v35 :: v_dual_max_f32 v35, v38, v37 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_dual_max_f32 v81, v81, v81 :: v_dual_and_b32 v80, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-NEXT: v_dual_max_f32 v70, v70, v70 :: v_dual_max_f32 v69, v69, v69 +; GFX11-NEXT: v_dual_max_f32 v36, v48, v39 :: v_dual_max_f32 v37, v50, v49 +; GFX11-NEXT: v_max_f32_e32 v39, v54, v53 +; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_max_f32 v1, v1, v17 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX11-NEXT: v_dual_max_f32 v32, v32, v32 :: v_dual_max_f32 v55, v55, v55 +; GFX11-NEXT: v_max_f32_e32 v52, v52, v52 +; GFX11-NEXT: v_dual_max_f32 v64, v64, v64 :: v_dual_max_f32 v71, v71, v71 +; GFX11-NEXT: v_max_f32_e32 v68, v68, v68 +; GFX11-NEXT: v_max_f32_e32 v80, v80, v80 +; GFX11-NEXT: v_max_f32_e32 v82, v82, v82 +; GFX11-NEXT: v_dual_max_f32 v86, v86, v86 :: v_dual_max_f32 v85, v85, v85 
+; GFX11-NEXT: v_dual_max_f32 v15, v15, v15 :: v_dual_max_f32 v84, v84, v84 +; GFX11-NEXT: v_dual_max_f32 v29, v29, v29 :: v_dual_max_f32 v30, v30, v30 +; GFX11-NEXT: v_dual_max_f32 v13, v13, v13 :: v_dual_max_f32 v14, v14, v14 +; GFX11-NEXT: v_dual_max_f32 v27, v27, v27 :: v_dual_max_f32 v28, v28, v28 +; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v12, v12, v12 +; GFX11-NEXT: v_dual_max_f32 v23, v23, v23 :: v_dual_max_f32 v24, v24, v24 +; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v8, v8, v8 +; GFX11-NEXT: v_dual_max_f32 v19, v19, v19 :: v_dual_max_f32 v20, v20, v20 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v4, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v18, v18, v18 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: v_dual_max_f32 v38, v52, v51 :: v_dual_max_f32 v53, v82, v81 +; GFX11-NEXT: v_dual_max_f32 v48, v64, v55 :: v_dual_max_f32 v55, v86, v85 +; GFX11-NEXT: v_dual_max_f32 v49, v66, v65 :: v_dual_max_f32 v50, v68, v67 +; GFX11-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX11-NEXT: v_dual_max_f32 v51, v70, v69 :: v_dual_max_f32 v52, v80, v71 +; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_max_f32 v54, v84, v83 +; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_max_f32 v14, v14, v30 +; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_max_f32 v12, v12, v28 +; GFX11-NEXT: v_dual_max_f32 v7, v7, v23 :: v_dual_max_f32 v8, v8, v24 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_max_f32 v4, v4, v20 +; GFX11-NEXT: v_perm_b32 v1, v1, v54, 0x3020706 +; GFX11-NEXT: v_perm_b32 v5, v5, v50, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_perm_b32 v7, v7, v48, 0x3020706 +; GFX11-NEXT: v_perm_b32 v3, v3, v52, 0x3020706 +; GFX11-NEXT: v_perm_b32 v4, v4, v51, 0x3020706 +; GFX11-NEXT: v_perm_b32 v8, v8, v39, 0x3020706 +; GFX11-NEXT: v_perm_b32 v9, v9, v38, 0x3020706 +; GFX11-NEXT: v_perm_b32 v10, v10, v37, 0x3020706 +; GFX11-NEXT: v_perm_b32 v11, v11, v36, 0x3020706 +; GFX11-NEXT: v_perm_b32 v12, v12, v35, 0x3020706 +; GFX11-NEXT: v_perm_b32 v13, v13, v34, 0x3020706 +; GFX11-NEXT: v_perm_b32 v14, v14, v33, 0x3020706 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v17, 16, v31 +; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 +; GFX11-NEXT: v_perm_b32 v6, v6, v49, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_max_f32 v17, v17, v17 :: v_dual_max_f32 v2, v2, v18 +; GFX11-NEXT: v_max_f32_e32 v16, v16, v16 +; GFX11-NEXT: v_perm_b32 v0, v0, v55, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_max_f32_e32 v15, v15, v17 +; GFX11-NEXT: v_perm_b32 v2, v2, v53, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v16, v32, v16 +; GFX11-NEXT: v_perm_b32 v15, v15, v16, 0x3020706 ; GFX11-NEXT: s_setpc_b64 s[30:31] - %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) - ret <2 x bfloat> %op + %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) + ret <32 x bfloat> %op } declare bfloat @llvm.sqrt.bf16(bfloat) diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index de9e320a363a0..f24cc6f177d62 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -3087,8 +3087,8 @@ define void 
@void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 ret void } -define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2, <2 x bfloat> %arg3) #0 { -; CI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16: +define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2, <2 x bfloat> %arg3, <4 x bfloat> %arg4) #0 { +; CI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -3103,39 +3103,55 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:24 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 +; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 ; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; CI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v17 -; CI-NEXT: v_lshrrev_b32_e32 v13, 16, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v18 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 +; CI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:12 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:16 +; CI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 ; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 +; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v17 +; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v19 +; CI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v19, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v15, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v8, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v14, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v13, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v12, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_store_short v17, off, s[4:7], 0 +; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_store_short v16, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v15, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v11, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short v14, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v10, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: buffer_store_short 
v13, off, s[4:7], 0 +; CI-NEXT: buffer_store_short v9, off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: s_setpc_b64 s[30:31] ; -; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16: +; VI-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -3150,26 +3166,38 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v20 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v19, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v13, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v16, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v12, off, s[4:7], 0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_short v20, off, s[4:7], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16: +; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 @@ -3184,36 +3212,54 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v20 ; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 
v[4:7], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v17, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v18, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v17, off, s[4:7], 0 +; GFX9-NEXT: buffer_store_dword v19, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v13, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v16, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v12, off, s[4:7], 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_short v20, off, s[4:7], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16: +; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x3 +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:12 ; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_waitcnt vmcnt(5) +; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v32 +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v33 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3232,19 +3278,28 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16(<32 x i32> %arg0, <2 x i16> %ar ; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: buffer_store_b32 v32, off, s[0:3], 0 dlc +; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: buffer_store_b32 v33, off, s[0:3], 0 dlc +; GFX11-NEXT: buffer_store_b32 v35, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc +; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v38, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v33, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v37, off, s[0:3], 0 dlc +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: buffer_store_b16 v32, off, s[0:3], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_setpc_b64 s[30:31] store volatile <32 x i32> %arg0, ptr addrspace(1) undef store volatile <2 x i16> %arg1, ptr addrspace(1) undef store volatile <2 x half> %arg2, ptr addrspace(1) undef store volatile <2 x bfloat> %arg3, ptr addrspace(1) undef + store volatile <4 x bfloat> %arg4, ptr addrspace(1) undef ret void } diff --git 
a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 91cff9d1a5419..844aa57de05ce 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -1582,8 +1582,8 @@ define <6 x half> @shuffle_v6f16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) ret <6 x half> %shuffle } -define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { -; GFX9-LABEL: fma_shuffle: +define amdgpu_kernel void @fma_shuffle_v2f16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { +; GFX9-LABEL: fma_shuffle_v2f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 @@ -1600,7 +1600,7 @@ define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, p ; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX9-NEXT: s_endpgm ; -; GFX10-LABEL: fma_shuffle: +; GFX10-LABEL: fma_shuffle_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 @@ -1619,7 +1619,7 @@ define amdgpu_kernel void @fma_shuffle(ptr addrspace(1) nocapture readonly %A, p ; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: fma_shuffle: +; GFX11-LABEL: fma_shuffle_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 @@ -1758,12 +1758,8 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(ptr addrspace(4) %in, ret void } -declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 -declare i32 @llvm.amdgcn.workitem.id.x() #0 - -attributes #0 = { nounwind readnone speculatable } -define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { -; GFX9-LABEL: low16bits: +define <2 x half> @low16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { +; GFX9-LABEL: low16bits_v2f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off @@ -1773,7 +1769,7 @@ define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: low16bits: +; GFX10-LABEL: low16bits_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off @@ -1782,7 +1778,7 @@ define <2 x half> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: low16bits: +; GFX11-LABEL: low16bits_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off @@ -1798,8 +1794,8 @@ entry: ret <2 x half> %vy1.2.vec.insert } -define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { -; GFX9-LABEL: hi16bits: +define <2 x half> @hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { +; GFX9-LABEL: hi16bits_v2f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off @@ -1809,7 +1805,7 @@ define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX9-NEXT: v_perm_b32 v0, v5, v4, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: hi16bits: +; GFX10-LABEL: 
hi16bits_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off @@ -1818,7 +1814,7 @@ define <2 x half> @hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX10-NEXT: v_perm_b32 v0, v5, v4, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: hi16bits: +; GFX11-LABEL: hi16bits_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off @@ -1834,8 +1830,8 @@ entry: ret <2 x half> %vy1.2.vec.insert } -define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { -; GFX9-LABEL: low16hi16bits: +define <2 x half> @low16hi16bits_v2f16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { +; GFX9-LABEL: low16hi16bits_v2f16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off @@ -1845,7 +1841,7 @@ define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX9-NEXT: v_bfi_b32 v0, s4, v4, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: low16hi16bits: +; GFX10-LABEL: low16hi16bits_v2f16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off @@ -1854,7 +1850,7 @@ define <2 x half> @low16hi16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v4, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: low16hi16bits: +; GFX11-LABEL: low16hi16bits_v2f16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off @@ -1870,8 +1866,8 @@ entry: ret <2 x half> %vy1.2.vec.insert } -define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { -; GFX9-LABEL: hi16low16bits: +define <2 x half> @hi16low16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { +; GFX9-LABEL: hi16low16bits_v2bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v[0:1], off @@ -1880,7 +1876,7 @@ define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: hi16low16bits: +; GFX10-LABEL: hi16low16bits_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: global_load_dword v4, v[0:1], off @@ -1889,7 +1885,7 @@ define <2 x half> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { ; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: hi16low16bits: +; GFX11-LABEL: hi16low16bits_v2bf16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v[0:1], off @@ -2675,3 +2671,2354 @@ define void @shuffle_v16i32_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg store <16 x i32> %shuffle, ptr addrspace(1) %out ret void } + +define <4 x bfloat> @shuffle_v4bf16_23uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_23uu: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_23uu: +; GFX10: ; %bb.0: +; GFX10-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_23uu: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_234u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_234u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_234u: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_234u: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 undef> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_u1u3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_u1u3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_u1u3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_u1u3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 1, i32 undef, i32 3> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_u3u1(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_u3u1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_u3u1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_u3u1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 1> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_u3uu(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_u3uu: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_u3uu: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_u3uu: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> 
@shuffle_v4bf16_3u6u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_3u6u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_3u6u: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_3u6u: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 undef, i32 6, i32 undef> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_3uu7(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_3uu7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_3uu7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_3uu7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 undef, i32 undef, i32 7> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_35u5(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_35u5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX9-NEXT: 
v_and_b32_e32 v2, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v1, v5, s5, v1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_35u5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_35u5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_357u(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_357u: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v4, s4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_357u: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v4, 0x3020706 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_357u: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x3020706 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_0101(ptr 
addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_0101: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_0101: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_0101: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_0123: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_0123: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_0123: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_0145(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_0145: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_0145: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, 
v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_0145: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_0167(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_0167: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_0167: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_0167: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 6, i32 7> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_2301(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_2301: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_2301: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 
v1, 0xffff, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_2301: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[1:2], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 0, i32 1> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_2323(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_2323: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_2323: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_2323: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 2, i32 3> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_2345(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_2345: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_2345: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_2345: +; GFX11: ; 
%bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_2367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_2367: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_2367: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_2367: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 6, i32 7> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_4501(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_4501: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_4501: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[2:3], off +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX10-NEXT: 
v_and_or_b32 v0, 0xffff, v4, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_4501: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 0, i32 1> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_4523(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_4523: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_4523: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_4523: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 2, i32 3> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_4545(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_4545: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_4545: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_4545: +; 
GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 4, i32 5> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_4567(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_4567: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_4567: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_4567: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_6701(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_6701: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v5, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_6701: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v5, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_6701: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x 
bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 0, i32 1> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_6723(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_6723: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_6723: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_6723: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 2, i32 3> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_6745(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_6745: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_6745: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[1:2], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_6745: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 4, i32 5> + ret <4 
x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_6767(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_6767: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_6767: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_6767: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 7, i32 6, i32 7> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_2356(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_2356: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_mov_b32 s5, 0x3020706 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX9-NEXT: v_and_or_b32 v0, v6, s4, v0 +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_2356: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v6, v0 +; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x3020706 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_2356: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x3020706 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, 
i32 5, i32 6> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_5623(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_5623: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_5623: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_5623: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x3020706 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_3456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_3456: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_mov_b32 s5, 0x3020706 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v2, v4, v6, 16 +; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX9-NEXT: v_perm_b32 v1, v4, v0, s5 +; GFX9-NEXT: v_and_or_b32 v0, v2, s4, v3 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_3456: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v0, v4, v6, 16 +; GFX10-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_perm_b32 v1, v4, v2, 0x3020706 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_3456: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b64 v[1:2], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX11-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x3020706 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_5634(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_5634: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_mov_b32 s5, 0x3020706 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s5 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_5634: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16 +; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x3020706 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_5634: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v2, v0, v4, 16 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_5734(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_5734: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v1, v4, v6, 16 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_perm_b32 v0, v4, v0, s4 +; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_5734: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v6, v[0:1], off offset:4 +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v1, v4, v6, 16 +; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706 
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX10-NEXT: v_perm_b32 v0, v4, v0, 0x3020706 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_5734: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:4 +; GFX11-NEXT: global_load_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v2, v0, v4, 16 +; GFX11-NEXT: v_perm_b32 v1, v0, v1, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_0000(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_0000: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_0000: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_0000: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> zeroinitializer + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_1010(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_1010: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_1010: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_1010: 
+; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 0, i32 1, i32 0> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_1100(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_1100: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_mov_b32 s5, 0x7060706 +; GFX9-NEXT: s_mov_b32 s6, 0x3020706 +; GFX9-NEXT: s_mov_b32 s7, 0x3020504 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX9-NEXT: v_perm_b32 v2, v1, v1, s5 +; GFX9-NEXT: v_and_or_b32 v3, v1, s4, v0 +; GFX9-NEXT: v_perm_b32 v0, v1, v2, s6 +; GFX9-NEXT: v_perm_b32 v1, v1, v3, s7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_1100: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_perm_b32 v2, v1, v1, 0x7060706 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 +; GFX10-NEXT: v_perm_b32 v0, v1, v2, 0x3020706 +; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x3020504 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_1100: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX11-NEXT: v_perm_b32 v2, v1, v1, 0x7060706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 +; GFX11-NEXT: v_perm_b32 v0, v1, v2, 0x3020706 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x3020504 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_6161(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_6161: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_6161: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; 
GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_6161: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_2333(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_2333: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060706 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_2333: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060706 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_2333: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060706 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v4bf16_6667(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_6667: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x7060706 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v1, v0, v0, s4 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s5, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_and_or_b32 v1, v1, s5, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX10-LABEL: shuffle_v4bf16_6667: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v1, v0, v0, 0x7060706 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_6667: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v1, v0, v0, 0x7060706 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v8bf16_0101(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v8bf16_0101: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v8bf16_0101: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v8bf16_0101: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 0, i32 1> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v8bf16_0123(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v8bf16_0123: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v8bf16_0123: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v8bf16_0123: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: 
global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v8bf16_4589(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v8bf16_4589: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:8 +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX9-NEXT: v_and_or_b32 v0, v4, s4, v0 +; GFX9-NEXT: v_and_or_b32 v1, v5, s4, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v8bf16_4589: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:8 +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v4, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v5, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v8bf16_4589: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off offset:8 +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 4, i32 5, i32 8, i32 9> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v8bf16_10_11_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v8bf16_10_11_2_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX9-NEXT: v_and_or_b32 v0, v5, s4, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v8bf16_10_11_2_3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v5, v[2:3], off offset:4 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v5, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v8bf16_10_11_2_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v[2:3], off offset:4 +; 
GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 10, i32 11, i32 2, i32 3> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v8bf16_13_14_2_3(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v8bf16_13_14_2_3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_alignbit_b32 v0, v6, v5, 16 +; GFX9-NEXT: v_perm_b32 v0, v5, v0, s4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v8bf16_13_14_2_3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8 +; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_alignbit_b32 v0, v6, v5, 16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, v4 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v8bf16_13_14_2_3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off offset:8 +; GFX11-NEXT: global_load_b32 v1, v[0:1], off offset:4 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_alignbit_b32 v0, v3, v2, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x3020706 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v3bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v3bf16_0122: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v3bf16_0122: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v3bf16_0122: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <3 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <3 x 
bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <3 x bfloat> %val0, <3 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2> + ret <4 x bfloat> %shuffle +} + +define <4 x bfloat> @shuffle_v2bf16_0122(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v2bf16_0122: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v2bf16_0122: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v2bf16_0122: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v1, v0, v0, 16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <2 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <2 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <2 x bfloat> %val0, <2 x bfloat> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0> + ret <4 x bfloat> %shuffle +} + +define <6 x bfloat> @shuffle_v6bf16_452367(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v6bf16_452367: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[5:6], off +; GFX9-NEXT: global_load_dword v7, v[3:4], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v6bf16_452367: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v4, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: global_load_dwordx3 v[0:2], v[5:6], off +; GFX10-NEXT: global_load_dword v7, v[3:4], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_mov_b32_e32 v0, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, v7 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v6bf16_452367: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX11-NEXT: global_load_b32 v3, v[3:4], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <6 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <6 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <6 x bfloat> %val0, <6 x bfloat> %val1, <6 x i32> <i32 4, i32 5, i32 2, i32 3, i32 6, 
i32 7> + ret <6 x bfloat> %shuffle +} + +define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonly %A, ptr addrspace(1) nocapture readonly %B, ptr addrspace(1) nocapture %C) { +; GFX9-LABEL: fma_shuffle_v2bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] +; GFX9-NEXT: s_mov_b32 s0, 0x3020706 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_fma_f32 v7, v9, v8, v7 +; GFX9-NEXT: v_fma_f32 v0, v9, v2, v0 +; GFX9-NEXT: v_fma_f32 v8, v11, v8, v12 +; GFX9-NEXT: v_fma_f32 v1, v11, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v3 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v8 +; GFX9-NEXT: v_fma_f32 v0, v4, v10, v0 +; GFX9-NEXT: v_fma_f32 v2, v4, v3, v2 +; GFX9-NEXT: v_fma_f32 v1, v5, v10, v1 +; GFX9-NEXT: v_fma_f32 v3, v5, v3, v7 +; GFX9-NEXT: v_perm_b32 v0, v2, v0, s0 +; GFX9-NEXT: v_perm_b32 v1, v3, v1, s0 +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: fma_shuffle_v2bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[6:7] +; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8 +; GFX10-NEXT: v_fmac_f32_e32 v0, v9, v2 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v3 +; GFX10-NEXT: v_fmac_f32_e32 v12, v11, v2 +; GFX10-NEXT: v_fmac_f32_e32 v1, v11, v8 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_fmac_f32_e32 v0, v4, v10 +; GFX10-NEXT: v_fmac_f32_e32 v5, v4, v3 +; GFX10-NEXT: v_fmac_f32_e32 v7, v2, v10 +; 
GFX10-NEXT: v_fmac_f32_e32 v1, v2, v3 +; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x3020706 +; GFX10-NEXT: v_perm_b32 v1, v1, v7, 0x3020706 +; GFX10-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: fma_shuffle_v2bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x10 +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_clause 0x2 +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v5 +; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_fmac_f32_e32 v1, v11, v8 +; GFX11-NEXT: v_dual_fmac_f32 v12, v11, v2 :: v_dual_lshlrev_b32 v7, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_fmac_f32 v0, v9, v2 :: v_dual_and_b32 v1, 0xffff0000, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; GFX11-NEXT: v_dual_fmac_f32 v1, v2, v3 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_fmac_f32 v7, v9, v8 :: v_dual_fmac_f32 v0, v4, v10 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 +; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v12 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_fmac_f32_e32 v5, v4, v3 +; GFX11-NEXT: v_fmac_f32_e32 v7, v2, v10 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x3020706 +; GFX11-NEXT: v_perm_b32 v1, v1, v7, 0x3020706 +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp12 = zext i32 %tmp1 to i64 + %arrayidx = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %A, i64 %tmp12 + %tmp14 = load <4 x bfloat>, ptr addrspace(1) %arrayidx, align 8 + %arrayidx1 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %B, i64 %tmp12 + %tmp15 = load <4 x bfloat>, ptr addrspace(1) %arrayidx1, align 8 + %arrayidx2 = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %C, i64 %tmp12 + %tmp16 = load <4 x bfloat>, ptr addrspace(1) %arrayidx2, align 8 + %tmp17 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> zeroinitializer + %tmp18 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1> + %tmp19 = shufflevector <4 x bfloat> %tmp16, <4 x bfloat> undef, <2 x i32> <i32 0, i32 1> + %tmp20 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp17, <2 x bfloat> %tmp18, <2 x bfloat> %tmp19) + %tmp21 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 1, i32 1> + %tmp22 = shufflevector <4 x bfloat> %tmp15, <4 x bfloat> undef, <2 x i32> <i32 2, 
i32 3> + %tmp23 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp21, <2 x bfloat> %tmp22, <2 x bfloat> %tmp20) + %tmp24 = shufflevector <2 x bfloat> %tmp23, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %tmp25 = shufflevector <4 x bfloat> %tmp24, <4 x bfloat> %tmp16, <4 x i32> <i32 0, i32 1, i32 6, i32 7> + %tmp26 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 2, i32 2> + %tmp27 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> undef, <2 x i32> <i32 2, i32 3> + %tmp28 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp26, <2 x bfloat> %tmp18, <2 x bfloat> %tmp27) + %tmp29 = shufflevector <4 x bfloat> %tmp14, <4 x bfloat> undef, <2 x i32> <i32 3, i32 3> + %tmp30 = tail call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %tmp29, <2 x bfloat> %tmp22, <2 x bfloat> %tmp28) + %tmp31 = shufflevector <2 x bfloat> %tmp30, <2 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> + %tmp32 = shufflevector <4 x bfloat> %tmp25, <4 x bfloat> %tmp31, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + store <4 x bfloat> %tmp32, ptr addrspace(1) %arrayidx2, align 8 + ret void +} + +define <4 x bfloat> @shuffle_v4bf16_0456(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) { +; GFX9-LABEL: shuffle_v4bf16_0456: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX9-NEXT: global_load_dword v6, v[0:1], off +; GFX9-NEXT: s_mov_b32 s4, 0x1000504 +; GFX9-NEXT: s_mov_b32 s5, 0x3020706 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v6, v4, s4 +; GFX9-NEXT: v_perm_b32 v1, v4, v1, s5 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v4bf16_0456: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX10-NEXT: global_load_dword v6, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v6, v4, 0x1000504 +; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x3020706 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v4bf16_0456: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_alignbit_b32 v1, v3, v2, 16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v2, 0x1000504 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x3020706 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6> + ret <4 x bfloat> %shuffle +} + +define <2 x bfloat> @low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { +; GFX9-LABEL: low16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x1000504 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: low16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; 
GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x1000504 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: low16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x1000504 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4 + %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 undef> + %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2> + ret <2 x bfloat> %vy1.2.vec.insert +} + +define <2 x bfloat> @hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { +; GFX9-LABEL: hi16bits_v2bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x3020706 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: hi16bits_v2bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020706 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: hi16bits_v2bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4 + %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 undef> + %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3> + ret <2 x bfloat> %vy1.2.vec.insert +} + +define <2 x bfloat> @low16hi16bits_v2bf16(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { +; GFX9-LABEL: low16hi16bits_v2bf16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_mov_b32 s4, 0x3020504 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_perm_b32 v0, v4, v5, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: low16hi16bits_v2bf16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_perm_b32 v0, v4, v5, 0x3020504 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: low16hi16bits_v2bf16: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020504 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4 + %1 = load <2 x 
bfloat>, ptr addrspace(1) %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 0, i32 undef> + %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 3> + ret <2 x bfloat> %vy1.2.vec.insert +} + +define <2 x bfloat> @hi16low16bits(ptr addrspace(1) %x0, ptr addrspace(1) %x1) { +; GFX9-LABEL: hi16low16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: global_load_dword v5, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: hi16low16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v5, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v0, v5, v4, 16 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: hi16low16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: global_load_b32 v1, v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4 + %1 = load <2 x bfloat>, ptr addrspace(1) %x1, align 4 + %vy1.0.vec.insert = shufflevector <2 x bfloat> %0, <2 x bfloat> poison, <2 x i32> <i32 1, i32 undef> + %vy1.2.vec.insert = shufflevector <2 x bfloat> %vy1.0.vec.insert, <2 x bfloat> %1, <2 x i32> <i32 0, i32 2> + ret <2 x bfloat> %vy1.2.vec.insert +} + +define <2 x bfloat> @v2bfloat_hi16bits(ptr addrspace(1) %x0) { +; GFX9-LABEL: v2bfloat_hi16bits: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v2bfloat_hi16bits: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v2bfloat_hi16bits: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +entry: + %load0 = load <2 x bfloat>, ptr addrspace(1) %x0, align 4 + %insert1 = insertelement <2 x bfloat> undef, bfloat 0.0, i32 0 + %insert2 = insertelement <2 x bfloat> %insert1, bfloat 0.0, i32 1 + %vec.ret = shufflevector <2 x bfloat> %insert2, <2 x bfloat> %load0, <2 x i32> <i32 0, i32 3> + ret <2 x bfloat> %vec.ret +} + +define void @shuffle_v8bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { +; GFX9-LABEL: shuffle_v8bf16_concat: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v8bf16_concat: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v8bf16_concat: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: global_load_b64 v[2:3], v[2:3], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <4 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <4 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <4 x bfloat> %val0, <4 x bfloat> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x bfloat> %shuffle, ptr addrspace(1) %out + ret void +} + +define void @shuffle_v16bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { +; GFX9-LABEL: shuffle_v16bf16_concat: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off +; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v16bf16_concat: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off +; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v16bf16_concat: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <8 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <8 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <8 x bfloat> %val0, <8 x bfloat> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + store <16 x bfloat> %shuffle, ptr addrspace(1) %out + ret void +} + +define void @shuffle_v32bf16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) { +; GFX9-LABEL: shuffle_v32bf16_concat: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off +; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[14:17], off +; GFX9-NEXT: s_waitcnt vmcnt(3) +; 
GFX9-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: shuffle_v32bf16_concat: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off +; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx4 v[14:17], v[0:1], off +; GFX10-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:16 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:32 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:48 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[14:17], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[4:5], v[18:21], off offset:16 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: shuffle_v32bf16_concat: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off +; GFX11-NEXT: global_load_b128 v[10:13], v[2:3], off offset:16 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_load_b128 v[14:17], v[0:1], off +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off offset:16 +; GFX11-NEXT: s_waitcnt vmcnt(3) +; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:32 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: global_store_b128 v[4:5], v[10:13], off offset:48 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: global_store_b128 v[4:5], v[14:17], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val0 = load <16 x bfloat>, ptr addrspace(1) %arg0 + %val1 = load <16 x bfloat>, ptr addrspace(1) %arg1 + %shuffle = shufflevector <16 x bfloat> %val0, <16 x bfloat> %val1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> + store <32 x bfloat> %shuffle, ptr addrspace(1) %out + ret void +} + +declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 +declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone speculatable } From cca63f9a363e1ce1b61ed6d4fb33b66d603eb388 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman154@yahoo.com> Date: Thu, 21 Dec 2023 21:42:41 -0800 Subject: [PATCH 139/342] [Docs] Remove llvm-objdump man page (#75759) This patch removes the explicit llvm-objdump man page. By enabling sphinx man page output with `-DLLVM_ENABLE_SPHINX=ON` and `-DSPHINX_OUTPUT_MAN=ON`, we can generate man pages for all the llvm binary utilities from the restructured text documentation. Having an additional man page upstream increases fragementation and maintenance. --- llvm/docs/llvm-objdump.1 | 209 --------------------------------------- 1 file changed, 209 deletions(-) delete mode 100644 llvm/docs/llvm-objdump.1 diff --git a/llvm/docs/llvm-objdump.1 b/llvm/docs/llvm-objdump.1 deleted file mode 100644 index 42dcc73676597..0000000000000 --- a/llvm/docs/llvm-objdump.1 +++ /dev/null @@ -1,209 +0,0 @@ -.\" Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-.\" See https://llvm.org/LICENSE.txt for license information. -.\" SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -.\" -.Dd December 19, 2018 -.Dt LLVM-OBJDUMP 1 -.Os -.Sh NAME -.Nm llvm-objdump -.Nd LLVM object file dumper -.Sh SYNOPSIS -.Nm llvm-objdump -.Op Ar options -.Ar objfile ... -.Sh DESCRIPTION -.Nm -prints the contents of object files and final linked images named on the -command line. -If no file name is specified, -.Nm -will attempt to read from -.Pa a.out . -If -.Pa - -is used as a file name, -.Nm -will process a file on its standard input stream. -.Nm -accepts many of the same command line arguments as GNU objdump. -.Sh OPTIONS -.Ss General Options -.Bl -tag -width indent -.It Fl -aarch64-neon-syntax Ns = Ns Ar value -Choose style of NEON code to emit from AArch64 backend. -.Ar value -may be one of: -.Bl -tag -width indent -.It generic -Generic NEON assembly -.It apple -Apple-style NEON assembly -.El -.It Fl -arch Ns = Ns Ar value -Choose architecture(s) from a Mach-O file to dump -.It Fl -arch-name Ns = Ns ar arch -Target arch to disassemble for. -See -.Fl -version -for available targets. -.It Fl -bind -Display mach-o binding info. -.It Fl -color -Use colored syntax highlighting. -Default autodetect. -.It Fl -disassemble -Display assembler mnemonics for machine instructions. -.It Fl -disassemble-all -Display assembler mnemonics for the machine instruction in all sections. -.It Fl -dsym Ns = Ns Ar file -Use -.Ar file -for debug info. -.It Fl -dwarf Ns = Ns Ar sections -Dump of dwarf debug sections. -.Bl -tag -width indent -.It frames -.Dv .debug_frame -.El -.It Fl -exports-trie -Display mach-o exported symbols. -.It Fl -fault-map-section -Display contents of faultmap section. -.It Fl -filter-print-funcs Ns = Ns Ar functions -Only print IR for functions whose name match -.Ar functions -for all print-[before|after][-all] options. -.It Fl -full-leading-addr -Print full leading address. -.It Fl g -Print line information from debug info if available. -.It Fl h , -headers , -section-headers -Display summaries of the headers for each section. -.It Fl -help -Display available options. -Use -.Fl -help-hidden -for more. -.It Fl -lazy-bind -Display mach-o lazy binding info. -.It Fl -line-numbers -Display source line numbers with disassembly. -Implies disassemble object. -.It Fl -macho -Use MachO specific object file parser. -.It Fl -mattr Ns = Ns Ar attribute ... -Target specific attributes. -.It Fl -mcpu Ns = Ns Ar CPU -Target a specific cpu type. -Use -.Fl mcpu Ns = Ns help -for details. -.It Fl -no-leading-addr -Print no leading address. -.It Fl -no-leading-headers -Print no leading headers. -.It Fl -no-show-raw-insn -When disassembling instructions, do not print the instruction bytes. -.It Fl -offloading -Display the content of the LLVM offloading section. -.It Fl -prefix Ns = Ns Ar PREFIX -When disassembling, add -.Ar PREFIX -to absolute paths. -.It Fl -prefix-strip Ns = Ns Ar LEVEL -When disassembling, strip out -.Ar LEVEL -initial directories from absolute paths. This option has no effect without -.Fl -prefix Ns = Ns PREFIX . -.It Fl -print-imm-hex -Use hex format for immediate values. -.It Fl -private-header -Display only the first format specific file header. -.It Fl -private-headers -Display format specific file headers. -.It Fl r -Display the relocation entries in the file. -.It Fl -raw-clang-ast -Dump the raw binary contents of the clang AST section. -.It Fl -rebase -Display mach-o rebasing info. -.It Fl -reverse-iterate -Reverse iterate. 
-.It Fl s -Display the content of each section. -.It Fl -section Ns = Ns Ar section -Operate on the specified sections only. -With -.Fl -macho -dump segment,section. -.It Fl -source -Display source inline with disassembly. -Implies disassemble object. -.It Fl -start-address Ns = Ns Ar address -Disassemble beginning at -.Ar address . -.It Fl -stop-address Ns = Ns Ar address -Stop disassembly at -.Ar address . -.It Fl t -Display the symbol table. -.It Fl -triple Ns = Ns Ar triple -Target triple to disassemble for. -See -.Fl -version -for available targets. -.It Fl -unwind-info -Display unwind information. -.It Fl -version -Display the version of this program. -.It Fl -weak-bind -Display mach-o weak binding info. -.It Fl -x86-asm-syntax Ns = Ns Ar syntax -Choose style of code to emit from X86 backend. -.Bl -tag -width indent -.It att -Emit AT&T-style assembly. -.It intel -Emit Intel-style assembly. -.El -.El -.Ss Mach-O Options -There are a number of options specific to the Mach-O format. -These are used in combination with the -.Fl -macho -option. -.Bl -tag -width indent -.It Fl -archive-headers -Print archive headers for Mach-O archives. -.It Fl -archive-member-offsets -Print the offset to each archive member for Mach-O archives. -Requires -.Fl -macho -and -.Fl -archive-headers . -.It Fl -data-in-code -Print the data in code table for Mach-O objects. -.It Fl -dis-symname Ns = Ns Ar symbol -Disassemble just -.Ar symbol 's -instructions. -.It Fl -dylib-id -Print the shared library's id for the dylib Mach-O file. -.It Fl -dylibs-used -Print the shared libraries used for linked Mach-O files. -.It Fl -indirect-symbols -Print indirect symbol table for Mach-O objects. -.It Fl -info-plist -Print the info plist section as strings for Mach-O objects. -.It Fl -link-opt-hints -Print the linker optimization hints for Mach-O objects. -.It Fl -no-symbolic-operands -do not symbolic operands when disassembling. -.It Fl -non-verbose -Print the info for Mach-O objects in non-verbose or numeric form. -.It Fl -objc-meta-data -Print the Objective-C runtime meta data for Mach-O files. -.It Fl -universal-headers -Print Mach-O universal headers. -.El From bb7f8f7938bf486ce67e824c94e9c87767694710 Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Thu, 21 Dec 2023 21:46:17 -0800 Subject: [PATCH 140/342] [test][hwasan] Update another test after #76133 --- compiler-rt/test/hwasan/TestCases/stack-uas.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler-rt/test/hwasan/TestCases/stack-uas.c b/compiler-rt/test/hwasan/TestCases/stack-uas.c index d38eedb87fc26..53a7054c1c435 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-uas.c +++ b/compiler-rt/test/hwasan/TestCases/stack-uas.c @@ -69,6 +69,8 @@ int main() { // CHECK: Cause: stack tag-mismatch // CHECK: is located in stack of thread // CHECK: Potentially referenced stack objects: + // CHECK: Cause: use-after-scope + // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 2048-byte region // CHECK-NEXT: {{zzz|yyy}} in buggy {{.*}}stack-uas.c: // CHECK: Memory tags around the buggy address From 782cf12d1145cccfa786f244e46561c424b95982 Mon Sep 17 00:00:00 2001 From: Artur Pilipenko <apilipenko@azul.com> Date: Thu, 21 Dec 2023 21:46:51 -0800 Subject: [PATCH 141/342] Fix chunk-print-before-all script After c718336c the output of print-before/print-after was changed. A semicolon was added before the banner "*** IR Dump ..." and this change broke chunk-print-before-all script. This change makes it more resilient to small output format variations. 
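A minimal sketch of the difference, using a hypothetical pass name in the banner (only the leading "; " prefix is taken from the description above):

    # Old banner started at column 0; the new one carries a leading "; ".
    line = "; *** IR Dump Before SomePass ***"   # hypothetical dump line
    line.startswith("*** IR Dump Before ")       # False - misses the new format
    "*** IR Dump Before " in line                # True - matches old and new formats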
--- llvm/utils/chunk-print-before-all.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/utils/chunk-print-before-all.py b/llvm/utils/chunk-print-before-all.py index fe0eaaea1c20d..fef8eb64c5403 100755 --- a/llvm/utils/chunk-print-before-all.py +++ b/llvm/utils/chunk-print-before-all.py @@ -30,13 +30,13 @@ def print_chunk(lines, prefix, pass_name): is_dump = False cur = [] for line in sys.stdin: - if line.startswith("*** IR Dump Before "): + if "*** IR Dump Before " in line: if len(cur) != 0: print_chunk(cur, "before", pass_name) cur = [] cur.append("; " + line) pass_name = get_pass_name(line, "Before") - elif line.startswith("*** IR Dump After "): + elif "*** IR Dump After " in line: if len(cur) != 0: print_chunk(cur, "after", pass_name) cur = [] From 4d1cd38c95d317f1b6b331ea811f7de8592b4ed6 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Wed, 20 Dec 2023 21:40:15 +0700 Subject: [PATCH 142/342] DAG: Handle promotion of fcanonicalize This avoids a regression in a future commit --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 0917d0e4eb3e2..a483b8028fda9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -5352,6 +5352,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::FEXP: case ISD::FEXP2: case ISD::FEXP10: + case ISD::FCANONICALIZE: Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1); Results.push_back( From 7b3323fffb2609b243d53c650d28c8760a144898 Mon Sep 17 00:00:00 2001 From: XinWang10 <108658776+XinWang10@users.noreply.github.com> Date: Fri, 22 Dec 2023 14:11:32 +0800 Subject: [PATCH 143/342] [X86][MC] Support Enc/Dec for EGPR for promoted CET instruction (#76023) R16-R31 was added into GPRs in https://github.com/llvm/llvm-project/pull/70958, This patch supports the encoding/decoding for promoted CET instruction in EVEX space. 
RFC: https://discourse.llvm.org/t/rfc-design-for-apx-feature-egpr-and-ndd-support/73031/4 --- llvm/lib/Target/X86/X86InstrSystem.td | 17 +++++++++++++++++ llvm/test/MC/Disassembler/X86/apx/wrssd.txt | 6 ++++++ llvm/test/MC/Disassembler/X86/apx/wrssq.txt | 6 ++++++ llvm/test/MC/Disassembler/X86/apx/wrussd.txt | 6 ++++++ llvm/test/MC/Disassembler/X86/apx/wrussq.txt | 6 ++++++ llvm/test/MC/X86/apx/wrssd-att.s | 8 ++++++++ llvm/test/MC/X86/apx/wrssd-intel.s | 5 +++++ llvm/test/MC/X86/apx/wrssq-att.s | 8 ++++++++ llvm/test/MC/X86/apx/wrssq-intel.s | 5 +++++ llvm/test/MC/X86/apx/wrussd-att.s | 8 ++++++++ llvm/test/MC/X86/apx/wrussd-intel.s | 5 +++++ llvm/test/MC/X86/apx/wrussq-att.s | 8 ++++++++ llvm/test/MC/X86/apx/wrussq-intel.s | 5 +++++ 13 files changed, 93 insertions(+) create mode 100644 llvm/test/MC/Disassembler/X86/apx/wrssd.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/wrssq.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/wrussd.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/wrussq.txt create mode 100644 llvm/test/MC/X86/apx/wrssd-att.s create mode 100644 llvm/test/MC/X86/apx/wrssd-intel.s create mode 100644 llvm/test/MC/X86/apx/wrssq-att.s create mode 100644 llvm/test/MC/X86/apx/wrssq-intel.s create mode 100644 llvm/test/MC/X86/apx/wrussd-att.s create mode 100644 llvm/test/MC/X86/apx/wrussd-intel.s create mode 100644 llvm/test/MC/X86/apx/wrussq-att.s create mode 100644 llvm/test/MC/X86/apx/wrussq-intel.s diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index 51972c63bb2ce..25db96b31be7a 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -520,6 +520,7 @@ let SchedRW = [WriteSystem] in { } // Defs SSP } // Uses SSP +let Predicates = [NoEGPR] in { def WRSSD : I<0xF6, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "wrssd\t{$src, $dst|$dst, $src}", [(int_x86_wrssd GR32:$src, addr:$dst)]>, T8PS; @@ -532,6 +533,22 @@ let SchedRW = [WriteSystem] in { def WRUSSQ : RI<0xF5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "wrussq\t{$src, $dst|$dst, $src}", [(int_x86_wrussq GR64:$src, addr:$dst)]>, T8PD; +} + +let Predicates = [HasEGPR, In64BitMode] in { + def WRSSD_EVEX : I<0x66, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "wrssd\t{$src, $dst|$dst, $src}", + [(int_x86_wrssd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PS; + def WRSSQ_EVEX : RI<0x66, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "wrssq\t{$src, $dst|$dst, $src}", + [(int_x86_wrssq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PS; + def WRUSSD_EVEX : I<0x65, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "wrussd\t{$src, $dst|$dst, $src}", + [(int_x86_wrussd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PD; + def WRUSSQ_EVEX : RI<0x65, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "wrussq\t{$src, $dst|$dst, $src}", + [(int_x86_wrussq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PD; +} let Defs = [SSP] in { let Uses = [SSP] in { diff --git a/llvm/test/MC/Disassembler/X86/apx/wrssd.txt b/llvm/test/MC/Disassembler/X86/apx/wrssd.txt new file mode 100644 index 0000000000000..600e85e1440e8 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/wrssd.txt @@ -0,0 +1,6 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: wrssd %r18d, 291(%r28,%r29,4) +# INTEL: wrssd dword ptr [r28 + 4*r29 + 291], r18d 
+0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/wrssq.txt b/llvm/test/MC/Disassembler/X86/apx/wrssq.txt new file mode 100644 index 0000000000000..9f5b26321fd2b --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/wrssq.txt @@ -0,0 +1,6 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: wrssq %r19, 291(%r28,%r29,4) +# INTEL: wrssq qword ptr [r28 + 4*r29 + 291], r19 +0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/wrussd.txt b/llvm/test/MC/Disassembler/X86/apx/wrussd.txt new file mode 100644 index 0000000000000..1b8b0007e2d32 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/wrussd.txt @@ -0,0 +1,6 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: wrussd %r18d, 291(%r28,%r29,4) +# INTEL: wrussd dword ptr [r28 + 4*r29 + 291], r18d +0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/wrussq.txt b/llvm/test/MC/Disassembler/X86/apx/wrussq.txt new file mode 100644 index 0000000000000..7ff51f617c5cc --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/wrussq.txt @@ -0,0 +1,6 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: wrussq %r19, 291(%r28,%r29,4) +# INTEL: wrussq qword ptr [r28 + 4*r29 + 291], r19 +0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/X86/apx/wrssd-att.s b/llvm/test/MC/X86/apx/wrssd-att.s new file mode 100644 index 0000000000000..409b3010f5c76 --- /dev/null +++ b/llvm/test/MC/X86/apx/wrssd-att.s @@ -0,0 +1,8 @@ +# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-1: error: +# ERROR-NOT: error: +# CHECK: wrssd %r18d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00] + wrssd %r18d, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/wrssd-intel.s b/llvm/test/MC/X86/apx/wrssd-intel.s new file mode 100644 index 0000000000000..1d402f2c51776 --- /dev/null +++ b/llvm/test/MC/X86/apx/wrssd-intel.s @@ -0,0 +1,5 @@ +# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +# CHECK: wrssd dword ptr [r28 + 4*r29 + 291], r18d +# CHECK: encoding: [0x62,0x8c,0x78,0x08,0x66,0x94,0xac,0x23,0x01,0x00,0x00] + wrssd dword ptr [r28 + 4*r29 + 291], r18d diff --git a/llvm/test/MC/X86/apx/wrssq-att.s b/llvm/test/MC/X86/apx/wrssq-att.s new file mode 100644 index 0000000000000..1f616ac2e4e47 --- /dev/null +++ b/llvm/test/MC/X86/apx/wrssq-att.s @@ -0,0 +1,8 @@ +# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-1: error: +# ERROR-NOT: error: +# CHECK: wrssq %r19, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00] + wrssq %r19, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/wrssq-intel.s 
b/llvm/test/MC/X86/apx/wrssq-intel.s new file mode 100644 index 0000000000000..d31dca55ca4a4 --- /dev/null +++ b/llvm/test/MC/X86/apx/wrssq-intel.s @@ -0,0 +1,5 @@ +# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +# CHECK: wrssq qword ptr [r28 + 4*r29 + 291], r19 +# CHECK: encoding: [0x62,0x8c,0xf8,0x08,0x66,0x9c,0xac,0x23,0x01,0x00,0x00] + wrssq qword ptr [r28 + 4*r29 + 291], r19 diff --git a/llvm/test/MC/X86/apx/wrussd-att.s b/llvm/test/MC/X86/apx/wrussd-att.s new file mode 100644 index 0000000000000..269d9a8aa8586 --- /dev/null +++ b/llvm/test/MC/X86/apx/wrussd-att.s @@ -0,0 +1,8 @@ +# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-1: error: +# ERROR-NOT: error: +# CHECK: wrussd %r18d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00] + wrussd %r18d, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/wrussd-intel.s b/llvm/test/MC/X86/apx/wrussd-intel.s new file mode 100644 index 0000000000000..fed6eb10d4add --- /dev/null +++ b/llvm/test/MC/X86/apx/wrussd-intel.s @@ -0,0 +1,5 @@ +# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +# CHECK: wrussd dword ptr [r28 + 4*r29 + 291], r18d +# CHECK: encoding: [0x62,0x8c,0x79,0x08,0x65,0x94,0xac,0x23,0x01,0x00,0x00] + wrussd dword ptr [r28 + 4*r29 + 291], r18d diff --git a/llvm/test/MC/X86/apx/wrussq-att.s b/llvm/test/MC/X86/apx/wrussq-att.s new file mode 100644 index 0000000000000..b41360cd9db04 --- /dev/null +++ b/llvm/test/MC/X86/apx/wrussq-att.s @@ -0,0 +1,8 @@ +# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-1: error: +# ERROR-NOT: error: +# CHECK: wrussq %r19, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00] + wrussq %r19, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/wrussq-intel.s b/llvm/test/MC/X86/apx/wrussq-intel.s new file mode 100644 index 0000000000000..a9a96da9d3d1d --- /dev/null +++ b/llvm/test/MC/X86/apx/wrussq-intel.s @@ -0,0 +1,5 @@ +# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +# CHECK: wrussq qword ptr [r28 + 4*r29 + 291], r19 +# CHECK: encoding: [0x62,0x8c,0xf9,0x08,0x65,0x9c,0xac,0x23,0x01,0x00,0x00] + wrussq qword ptr [r28 + 4*r29 + 291], r19 From f25bcfbb291e3d213eaded5cfa84d3d4e7002052 Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Thu, 21 Dec 2023 22:23:54 -0800 Subject: [PATCH 144/342] [test][hwasan] XFAIL new test which fails for unknown reason --- compiler-rt/test/hwasan/TestCases/strip_path_prefix.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c index 1c89b47af155b..5e41d03b683e9 100644 --- a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c +++ b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c @@ -3,6 +3,9 @@ // Stack histories currently are not recorded on x86. // XFAIL: target=x86_64{{.*}} +// FIXME: Android does not see a variable. 
+// XFAIL: android + #include <assert.h> #include <sanitizer/hwasan_interface.h> #include <stdio.h> From 90f816e61f48c22861aeadf31ca6338f88f9e08a Mon Sep 17 00:00:00 2001 From: wangpc <wangpengcheng.pp@bytedance.com> Date: Fri, 22 Dec 2023 14:20:09 +0800 Subject: [PATCH 145/342] [RISCV] Rename TuneVeyronFusions to TuneVentanaVeyron And fusion features are added to processor definition. --- llvm/lib/Target/RISCV/RISCVFeatures.td | 8 ++------ llvm/lib/Target/RISCV/RISCVProcessors.td | 6 +++++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 2095446c694bd..5048e28545a3c 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1001,12 +1001,8 @@ def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7", [TuneNoDefaultUnroll, TuneShortForwardBranchOpt]>; -def TuneVeyronFusions : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron", - "Ventana Veyron-Series processors", - [TuneLUIADDIFusion, - TuneAUIPCADDIFusion, - TuneShiftedZExtFusion, - TuneLDADDFusion]>; +def TuneVentanaVeyron : SubtargetFeature<"ventana-veyron", "RISCVProcFamily", "VentanaVeyron", + "Ventana Veyron-Series processors">; // Assume that lock-free native-width atomics are available, even if the target // and operating system combination would not usually provide them. The user diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 16c79519fcacc..71c250634cfc9 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -273,7 +273,11 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", FeatureStdExtZicbop, FeatureStdExtZicboz, FeatureVendorXVentanaCondOps], - [TuneVeyronFusions]>; + [TuneVentanaVeyron, + TuneLUIADDIFusion, + TuneAUIPCADDIFusion, + TuneShiftedZExtFusion, + TuneLDADDFusion]>; def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu", NoSchedModel, From f9c908862a34b464694087705ebaf070f87f251c Mon Sep 17 00:00:00 2001 From: Wang Pengcheng <wangpengcheng.pp@bytedance.com> Date: Fri, 22 Dec 2023 14:37:26 +0800 Subject: [PATCH 146/342] [RISCV] Split TuneShiftedZExtFusion (#76032) We split `TuneShiftedZExtFusion` into three fusions to make them reusable and match the GCC implementation[1]. The zexth/zextw fusions can be reused by XiangShan[2] and other commercial processors, but shifted zero extension is not so common. `macro-fusions-veyron-v1.mir` is renamed so it's not relevant to specific processor. 
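For reference, the instruction pairs matched by the three new tune features are (using the rd/rs1 placeholder notation from the comments in RISCVMacroFusion.cpp; the register choices are illustrative only):

    zexth-fusion:          slli rd, rs1, 48  ->  srli rd, rd, 48
    zextw-fusion:          slli rd, rs1, 32  ->  srli rd, rd, 32
    shifted-zextw-fusion:  slli rd, rs1, 32  ->  srli rd, rd, x   (0 <= x < 32)

All three can be enabled together, e.g. -mattr=+zexth-fusion,+zextw-fusion,+shifted-zextw-fusion, as the updated macro-fusions.mir test does.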
References: [1] https://gcc.gnu.org/pipermail/gcc-patches/2023-November/637303.html [2] https://xiangshan-doc.readthedocs.io/zh_CN/latest/frontend/decode --- llvm/lib/Target/RISCV/RISCVFeatures.td | 16 +++- llvm/lib/Target/RISCV/RISCVMacroFusion.cpp | 82 +++++++++++++++---- llvm/lib/Target/RISCV/RISCVProcessors.td | 4 +- llvm/lib/Target/RISCV/RISCVSubtarget.h | 4 +- ...usions-veyron-v1.mir => macro-fusions.mir} | 29 +++++-- 5 files changed, 108 insertions(+), 27 deletions(-) rename llvm/test/CodeGen/RISCV/{macro-fusions-veyron-v1.mir => macro-fusions.mir} (83%) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 5048e28545a3c..a66dd135ae5f8 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -977,9 +977,19 @@ def TuneLUIADDIFusion def TuneAUIPCADDIFusion : SubtargetFeature<"auipc-addi-fusion", "HasAUIPCADDIFusion", "true", "Enable AUIPC+ADDI macrofusion">; -def TuneShiftedZExtFusion - : SubtargetFeature<"shifted-zext-fusion", "HasShiftedZExtFusion", - "true", "Enable SLLI+SRLI to be fused when computing (shifted) zero extension">; + +def TuneZExtHFusion + : SubtargetFeature<"zexth-fusion", "HasZExtHFusion", + "true", "Enable SLLI+SRLI to be fused to zero extension of halfword">; + +def TuneZExtWFusion + : SubtargetFeature<"zextw-fusion", "HasZExtWFusion", + "true", "Enable SLLI+SRLI to be fused to zero extension of word">; + +def TuneShiftedZExtWFusion + : SubtargetFeature<"shifted-zextw-fusion", "HasShiftedZExtWFusion", + "true", "Enable SLLI+SRLI to be fused when computing (shifted) zero extension of word">; + def TuneLDADDFusion : SubtargetFeature<"ld-add-fusion", "HasLDADDFusion", "true", "Enable LD+ADD macrofusion.">; diff --git a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp b/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp index 02ea5270823d8..f948f05b22f77 100644 --- a/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp +++ b/llvm/lib/Target/RISCV/RISCVMacroFusion.cpp @@ -58,18 +58,66 @@ static bool isLDADD(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); } -// Fuse these patterns: -// -// slli rd, rs1, 32 -// srli rd, rd, x -// where 0 <= x <= 32 -// -// and -// +// Fuse zero extension of halfword: // slli rd, rs1, 48 +// srli rd, rd, 48 +static bool isZExtH(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { + if (SecondMI.getOpcode() != RISCV::SRLI) + return false; + + if (!SecondMI.getOperand(2).isImm()) + return false; + + if (SecondMI.getOperand(2).getImm() != 48) + return false; + + // Given SecondMI, when FirstMI is unspecified, we must return + // if SecondMI may be part of a fused pair at all. + if (!FirstMI) + return true; + + if (FirstMI->getOpcode() != RISCV::SLLI) + return false; + + if (FirstMI->getOperand(2).getImm() != 48) + return false; + + return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); +} + +// Fuse zero extension of word: +// slli rd, rs1, 32 +// srli rd, rd, 32 +static bool isZExtW(const MachineInstr *FirstMI, const MachineInstr &SecondMI) { + if (SecondMI.getOpcode() != RISCV::SRLI) + return false; + + if (!SecondMI.getOperand(2).isImm()) + return false; + + if (SecondMI.getOperand(2).getImm() != 32) + return false; + + // Given SecondMI, when FirstMI is unspecified, we must return + // if SecondMI may be part of a fused pair at all. 
+ if (!FirstMI) + return true; + + if (FirstMI->getOpcode() != RISCV::SLLI) + return false; + + if (FirstMI->getOperand(2).getImm() != 32) + return false; + + return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); +} + +// Fuse shifted zero extension of word: +// slli rd, rs1, 32 // srli rd, rd, x -static bool isShiftedZExt(const MachineInstr *FirstMI, - const MachineInstr &SecondMI) { +// where 0 <= x < 32 +static bool isShiftedZExtW(const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { if (SecondMI.getOpcode() != RISCV::SRLI) return false; @@ -77,8 +125,7 @@ static bool isShiftedZExt(const MachineInstr *FirstMI, return false; unsigned SRLIImm = SecondMI.getOperand(2).getImm(); - bool IsShiftBy48 = SRLIImm == 48; - if (SRLIImm > 32 && !IsShiftBy48) + if (SRLIImm >= 32) return false; // Given SecondMI, when FirstMI is unspecified, we must return @@ -89,8 +136,7 @@ static bool isShiftedZExt(const MachineInstr *FirstMI, if (FirstMI->getOpcode() != RISCV::SLLI) return false; - unsigned SLLIImm = FirstMI->getOperand(2).getImm(); - if (IsShiftBy48 ? (SLLIImm != 48) : (SLLIImm != 32)) + if (FirstMI->getOperand(2).getImm() != 32) return false; return checkRegisters(FirstMI->getOperand(0).getReg(), SecondMI); @@ -144,7 +190,13 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, if (ST.hasAUIPCADDIFusion() && isAUIPCADDI(FirstMI, SecondMI)) return true; - if (ST.hasShiftedZExtFusion() && isShiftedZExt(FirstMI, SecondMI)) + if (ST.hasZExtHFusion() && isZExtH(FirstMI, SecondMI)) + return true; + + if (ST.hasZExtWFusion() && isZExtW(FirstMI, SecondMI)) + return true; + + if (ST.hasShiftedZExtWFusion() && isShiftedZExtW(FirstMI, SecondMI)) return true; if (ST.hasLDADDFusion() && isLDADD(FirstMI, SecondMI)) diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 71c250634cfc9..6362a3bef6f28 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -276,7 +276,9 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", [TuneVentanaVeyron, TuneLUIADDIFusion, TuneAUIPCADDIFusion, - TuneShiftedZExtFusion, + TuneZExtHFusion, + TuneZExtWFusion, + TuneShiftedZExtWFusion, TuneLDADDFusion]>; def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu", diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 7540218633bfc..26320b05d9be2 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -190,8 +190,8 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { } bool hasMacroFusion() const { - return hasLUIADDIFusion() || hasAUIPCADDIFusion() || - hasShiftedZExtFusion() || hasLDADDFusion(); + return hasLUIADDIFusion() || hasAUIPCADDIFusion() || hasZExtHFusion() || + hasZExtWFusion() || hasShiftedZExtWFusion() || hasLDADDFusion(); } // Vector codegen related methods. 
diff --git a/llvm/test/CodeGen/RISCV/macro-fusions-veyron-v1.mir b/llvm/test/CodeGen/RISCV/macro-fusions.mir similarity index 83% rename from llvm/test/CodeGen/RISCV/macro-fusions-veyron-v1.mir rename to llvm/test/CodeGen/RISCV/macro-fusions.mir index 6d1e92e997b32..b7568ae6f0f69 100644 --- a/llvm/test/CodeGen/RISCV/macro-fusions-veyron-v1.mir +++ b/llvm/test/CodeGen/RISCV/macro-fusions.mir @@ -1,7 +1,7 @@ # REQUIRES: asserts -# RUN: llc -mtriple=riscv64-linux-gnu -mcpu=veyron-v1 -x=mir < %s \ +# RUN: llc -mtriple=riscv64-linux-gnu -x=mir < %s \ # RUN: -debug-only=machine-scheduler -start-before=machine-scheduler 2>&1 \ -# RUN: -mattr=+lui-addi-fusion,+auipc-addi-fusion,+shifted-zext-fusion,+ld-add-fusion \ +# RUN: -mattr=+lui-addi-fusion,+auipc-addi-fusion,+zexth-fusion,+zextw-fusion,+shifted-zextw-fusion,+ld-add-fusion \ # RUN: | FileCheck %s # CHECK: lui_addi:%bb.0 @@ -38,10 +38,10 @@ body: | PseudoRET ... -# CHECK: slli_srli +# CHECK: slli_srli_shifted_zext # CHECK: Macro fuse: {{.*}}SLLI - SRLI --- -name: slli_srli +name: shifted_zext tracksRegLiveness: true body: | bb.0.entry: @@ -55,10 +55,10 @@ body: | PseudoRET ... -# CHECK: slli_srli_48 +# CHECK: slli_srli_zexth # CHECK: Macro fuse: {{.*}}SLLI - SRLI --- -name: slli_srli_48 +name: zexth tracksRegLiveness: true body: | bb.0.entry: @@ -72,6 +72,23 @@ body: | PseudoRET ... +# CHECK: slli_srli_zextw +# CHECK: Macro fuse: {{.*}}SLLI - SRLI +--- +name: zextw +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10 + %1:gpr = COPY $x10 + %2:gpr = SLLI %1, 32 + %3:gpr = XORI %1, 3 + %4:gpr = SRLI %2, 32 + $x10 = COPY %3 + $x11 = COPY %4 + PseudoRET +... + # CHECK: slli_srli_no_fusion_0 # CHECK-NOT: Macro fuse: {{.*}}SLLI - SRLI --- From fc3eed1bce0322fcfd9726b9f2ba747cb9c63802 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman154@yahoo.com> Date: Thu, 21 Dec 2023 22:49:04 -0800 Subject: [PATCH 147/342] [Github] Reformat strings for code format action (#75764) Before this patch, there was a regression in comment formatting due to some code formatting in bd3e8eb6e325081bf7cfbe93652aa825de3170e5. This was fixed in 428660cfb986dd0a59cd2a16972c5f7109080522. Github interprets a tab before a string as starting code formatting. The message that indicted the code formatting in a PR had been fixed was refactored to a python multi-line string, but with a tab in front, causing these messages to be rendered as code blocks in Github, instead of as intended. This patch builds upon the original fix to reformat the strings so that they fit within ~80 character lines and are simpler to modify in the future, hopefully removing traps like the one that caused the original issue. --- llvm/utils/git/code-format-helper.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index 849ae996f7339..8a29a57d8d16b 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -130,9 +130,10 @@ def run(self, changed_files: List[str], args: FormatArgs) -> bool: if diff is None: if should_update_gh: - comment_text = f""" -:white_check_mark: With the latest revision this PR passed the {self.friendly_name}. -""" + comment_text = ( + ":white_check_mark: With the latest revision " + f"this PR passed the {self.friendly_name}." 
+ ) self.update_pr(comment_text, args, create_new=False) return True elif len(diff) > 0: @@ -141,15 +142,17 @@ def run(self, changed_files: List[str], args: FormatArgs) -> bool: self.update_pr(comment_text, args, create_new=True) else: print( - f"Warning: {self.friendly_name}, {self.name} detected some issues with your code formatting..." + f"Warning: {self.friendly_name}, {self.name} detected " + "some issues with your code formatting..." ) return False else: # The formatter failed but didn't output a diff (e.g. some sort of # infrastructure failure). - comment_text = f""" -:warning: The {self.friendly_name} failed without printing a diff. Check the logs for stderr output. :warning: -""" + comment_text = ( + f":warning: The {self.friendly_name} failed without printing " + "a diff. Check the logs for stderr output. :warning:" + ) self.update_pr(comment_text, args, create_new=False) return False From 59eebb40fbedf6bc35746a0639f823a19ab0f030 Mon Sep 17 00:00:00 2001 From: wangpc <wangpengcheng.pp@bytedance.com> Date: Fri, 22 Dec 2023 14:49:20 +0800 Subject: [PATCH 148/342] [RISCV] Fix macro-fusions.mir --- llvm/test/CodeGen/RISCV/macro-fusions.mir | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/macro-fusions.mir b/llvm/test/CodeGen/RISCV/macro-fusions.mir index b7568ae6f0f69..13464141ce27e 100644 --- a/llvm/test/CodeGen/RISCV/macro-fusions.mir +++ b/llvm/test/CodeGen/RISCV/macro-fusions.mir @@ -41,7 +41,7 @@ body: | # CHECK: slli_srli_shifted_zext # CHECK: Macro fuse: {{.*}}SLLI - SRLI --- -name: shifted_zext +name: slli_srli_shifted_zext tracksRegLiveness: true body: | bb.0.entry: @@ -58,7 +58,7 @@ body: | # CHECK: slli_srli_zexth # CHECK: Macro fuse: {{.*}}SLLI - SRLI --- -name: zexth +name: slli_srli_zexth tracksRegLiveness: true body: | bb.0.entry: @@ -75,7 +75,7 @@ body: | # CHECK: slli_srli_zextw # CHECK: Macro fuse: {{.*}}SLLI - SRLI --- -name: zextw +name: slli_srli_zextw tracksRegLiveness: true body: | bb.0.entry: From 33707fe14bbec63ab2de3515fdd8bd26a50f7283 Mon Sep 17 00:00:00 2001 From: XinWang10 <108658776+XinWang10@users.noreply.github.com> Date: Fri, 22 Dec 2023 15:09:18 +0800 Subject: [PATCH 149/342] Omited resolution change (#12238) --- openmp/libomptarget/test/lit.cfg | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index 64694128c032f..d44748a5b636e 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -140,13 +140,9 @@ else: # Unices if config.cuda_libdir: config.test_flags += " -Wl,-rpath," + config.cuda_libdir if config.libomptarget_current_target.startswith('nvptx'): -<<<<<<< HEAD - config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir + '/DeviceRTL' + config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.library_dir + '/DeviceRTL' if config.libomptarget_current_target.endswith('-oldDriver'): config.test_flags += " -fno-openmp-new-driver" -======= - config.test_flags_clang += " --libomptarget-nvptx-bc-path=" + config.library_dir + '/DeviceRTL' ->>>>>>> 12250c4092b9f8fd043b37cbb73555706a4a412b if config.libomptarget_current_target.endswith('-LTO'): config.test_flags += " -foffload-lto" if config.libomptarget_current_target.endswith('-JIT-LTO') and evaluate_bool_env( From 1d4691a2338c816e18e0d7c7db9a6062dd89f68c Mon Sep 17 00:00:00 2001 From: XinWang10 <108658776+XinWang10@users.noreply.github.com> Date: Fri, 22 Dec 2023 15:19:56 +0800 Subject: [PATCH 150/342] 
[X86][MC] Support Enc/Dec for EGPR for promoted CMPCCXADD instruction (#76125) R16-R31 was added into GPRs in https://github.com/llvm/llvm-project/pull/70958, This patch supports the encoding/decoding for promoted CMPCCXADD instruction in EVEX space. RFC: https://discourse.llvm.org/t/rfc-design-for-apx-feature-egpr-and-ndd-support/73031/4 --- .../X86/MCTargetDesc/X86MCCodeEmitter.cpp | 2 +- llvm/lib/Target/X86/X86InstrAsmAlias.td | 5 + llvm/lib/Target/X86/X86InstrMisc.td | 21 ++- .../MC/Disassembler/X86/apx/cmpccxadd.txt | 122 +++++++++++++++++ .../MC/Disassembler/X86/apx/evex-format.txt | 6 + llvm/test/MC/X86/apx/cmpccxadd-att.s | 124 ++++++++++++++++++ llvm/test/MC/X86/apx/cmpccxadd-intel.s | 121 +++++++++++++++++ llvm/test/MC/X86/apx/evex-format-att.s | 6 + llvm/test/MC/X86/apx/evex-format-intel.s | 6 + 9 files changed, 410 insertions(+), 3 deletions(-) create mode 100644 llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt create mode 100644 llvm/test/MC/X86/apx/cmpccxadd-att.s create mode 100644 llvm/test/MC/X86/apx/cmpccxadd-intel.s diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index b6ebbcf56aef7..9e1f1eb97e703 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -1060,7 +1060,7 @@ X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI, Prefix.setBB2(MI, MemOperand + X86::AddrBaseReg); Prefix.setXX2(MI, MemOperand + X86::AddrIndexReg); CurOp += X86::AddrNumOperands; - Prefix.set4V(MI, CurOp++); + Prefix.set4VV2(MI, CurOp++); break; } case X86II::MRM_C0: diff --git a/llvm/lib/Target/X86/X86InstrAsmAlias.td b/llvm/lib/Target/X86/X86InstrAsmAlias.td index f1a90d9c59c3d..2590be8651d51 100644 --- a/llvm/lib/Target/X86/X86InstrAsmAlias.td +++ b/llvm/lib/Target/X86/X86InstrAsmAlias.td @@ -55,6 +55,11 @@ multiclass CMPCCXADD_Aliases<string Cond, int CC> { (CMPCCXADDmr32 GR32:$dst, i32mem:$dstsrc2, GR32:$src3, CC), 0>; def : InstAlias<"cmp"#Cond#"xadd"#"\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", (CMPCCXADDmr64 GR64:$dst, i64mem:$dstsrc2, GR64:$src3, CC), 0>; + + def : InstAlias<"cmp"#Cond#"xadd"#"\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", + (CMPCCXADDmr32_EVEX GR32:$dst, i32mem:$dstsrc2, GR32:$src3, CC), 0>; + def : InstAlias<"cmp"#Cond#"xadd"#"\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", + (CMPCCXADDmr64_EVEX GR64:$dst, i64mem:$dstsrc2, GR64:$src3, CC), 0>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index a6bed74b5bef1..d3a3fb7fefc23 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1663,8 +1663,8 @@ let Predicates = [HasPREFETCHI, In64BitMode], SchedRW = [WriteLoad] in { // CMPCCXADD Instructions // let isCodeGenOnly = 1, ForceDisassemble = 1, mayLoad = 1, mayStore = 1, - Predicates = [HasCMPCCXADD, In64BitMode], Defs = [EFLAGS], - Constraints = "$dstsrc1 = $dst" in { + Defs = [EFLAGS], Constraints = "$dstsrc1 = $dst" in { +let Predicates = [HasCMPCCXADD, NoEGPR, In64BitMode] in { def CMPCCXADDmr32 : I<0xe0, MRMDestMem4VOp3CC, (outs GR32:$dst), (ins GR32:$dstsrc1, i32mem:$dstsrc2, GR32:$src3, ccode:$cond), "cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", @@ -1680,6 +1680,23 @@ def CMPCCXADDmr64 : I<0xe0, MRMDestMem4VOp3CC, (outs GR64:$dst), VEX, VVVV, REX_W, T8PD, Sched<[WriteXCHG]>; } +let Predicates = [HasCMPCCXADD, 
HasEGPR, In64BitMode] in { +def CMPCCXADDmr32_EVEX : I<0xe0, MRMDestMem4VOp3CC, (outs GR32:$dst), + (ins GR32:$dstsrc1, i32mem:$dstsrc2, GR32:$src3, ccode:$cond), + "cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", + [(set GR32:$dst, (X86cmpccxadd addr:$dstsrc2, + GR32:$dstsrc1, GR32:$src3, timm:$cond))]>, + EVEX, VVVV, NoCD8, T8PD, Sched<[WriteXCHG]>; + +def CMPCCXADDmr64_EVEX : I<0xe0, MRMDestMem4VOp3CC, (outs GR64:$dst), + (ins GR64:$dstsrc1, i64mem:$dstsrc2, GR64:$src3, ccode:$cond), + "cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", + [(set GR64:$dst, (X86cmpccxadd addr:$dstsrc2, + GR64:$dstsrc1, GR64:$src3, timm:$cond))]>, + EVEX, VVVV, NoCD8, REX_W, T8PD, Sched<[WriteXCHG]>; +} +} + //===----------------------------------------------------------------------===// // Memory Instructions // diff --git a/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt b/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt new file mode 100644 index 0000000000000..9f65d4c8d25ce --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt @@ -0,0 +1,122 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: cmpbexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpbexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpbexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe6,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpbxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpbxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe2,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpbxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmplexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmplexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xee,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmplexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmplexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xee,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmplxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmplxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xec,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmplxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpaxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpgxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpgexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpgexadd qword ptr [r28 + 4*r29 
+ 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpnoxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpnoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe1,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpnoxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpnoxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe1,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpnpxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpnpxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xeb,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpnpxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpnpxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xeb,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpnsxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpnsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe9,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpnsxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpnsxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe9,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpnexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpoxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe0,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpoxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpoxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe0,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmppxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmppxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xea,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmppxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmppxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xea,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpsxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe8,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpsxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpsxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe8,0xbc,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: cmpexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt index 389b22cb4a223..01676fe056925 100644 --- a/llvm/test/MC/Disassembler/X86/apx/evex-format.txt +++ b/llvm/test/MC/Disassembler/X86/apx/evex-format.txt @@ -62,6 +62,12 @@ # INTEL: vpslldq zmm0, zmmword ptr [r16 + r17], 0 0x62,0xf9,0x79,0x48,0x73,0x3c,0x08,0x00 +## MRMDestMem4VOp3CC + +# ATT: cmpbexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00 + ## MRMSrcMem4VOp3 # ATT: bzhiq %r19, 291(%r28,%r29,4), %r23 diff --git a/llvm/test/MC/X86/apx/cmpccxadd-att.s b/llvm/test/MC/X86/apx/cmpccxadd-att.s new file mode 100644 index 0000000000000..ce23588a18499 --- /dev/null +++ 
b/llvm/test/MC/X86/apx/cmpccxadd-att.s @@ -0,0 +1,124 @@ +# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-30: error: +# ERROR-NOT: error: +# CHECK: cmpbexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpbexadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpbexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe6,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpbexadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmpbxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe2,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpbxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpbxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpbxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmplexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xee,0xb4,0xac,0x23,0x01,0x00,0x00] + cmplexadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmplexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xee,0xbc,0xac,0x23,0x01,0x00,0x00] + cmplexadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmplxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xec,0xb4,0xac,0x23,0x01,0x00,0x00] + cmplxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmplxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00] + cmplxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpaxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpaxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpgxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpgxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpgexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpgexadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmpnoxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe1,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpnoxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpnoxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe1,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpnoxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmpnpxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xeb,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpnpxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpnpxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xeb,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpnpxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmpnsxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe9,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpnsxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpnsxadd %r19, %r23, 291(%r28,%r29,4) +# 
CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe9,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpnsxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpnexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpnexadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmpoxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe0,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpoxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpoxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe0,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpoxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmppxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xea,0xb4,0xac,0x23,0x01,0x00,0x00] + cmppxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmppxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xea,0xbc,0xac,0x23,0x01,0x00,0x00] + cmppxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmpsxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe8,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpsxadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpsxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe8,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpsxadd %r19, %r23, 291(%r28,%r29,4) + +# CHECK: cmpexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpexadd %r18d, %r22d, 291(%r28,%r29,4) + +# CHECK: cmpexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpexadd %r19, %r23, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/cmpccxadd-intel.s b/llvm/test/MC/X86/apx/cmpccxadd-intel.s new file mode 100644 index 0000000000000..c2630d3d9273b --- /dev/null +++ b/llvm/test/MC/X86/apx/cmpccxadd-intel.s @@ -0,0 +1,121 @@ +# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +# CHECK: cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpbexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe6,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpbexadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmpbxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe2,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpbxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmplexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xee,0xb4,0xac,0x23,0x01,0x00,0x00] + cmplexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmplexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xee,0xbc,0xac,0x23,0x01,0x00,0x00] + cmplexadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmplxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xec,0xb4,0xac,0x23,0x01,0x00,0x00] + cmplxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# 
CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00] + cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmpnoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe1,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpnoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpnoxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe1,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpnoxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmpnpxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xeb,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpnpxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpnpxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xeb,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpnpxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmpnsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe9,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpnsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpnsxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe9,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpnsxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmpoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe0,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpoxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpoxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe0,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpoxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmppxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xea,0xb4,0xac,0x23,0x01,0x00,0x00] + cmppxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmppxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xea,0xbc,0xac,0x23,0x01,0x00,0x00] + cmppxadd qword ptr 
[r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmpsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe8,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpsxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpsxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe8,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpsxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + +# CHECK: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + +# CHECK: cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00] + cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 diff --git a/llvm/test/MC/X86/apx/evex-format-att.s b/llvm/test/MC/X86/apx/evex-format-att.s index 0b2e860d6ba09..33ad8e3abe75f 100644 --- a/llvm/test/MC/X86/apx/evex-format-att.s +++ b/llvm/test/MC/X86/apx/evex-format-att.s @@ -60,6 +60,12 @@ # CHECK: encoding: [0x62,0xf9,0x79,0x48,0x73,0x3c,0x08,0x00] vpslldq $0, (%r16,%r17), %zmm0 +## MRMDestMem4VOp3CC + +# CHECK: cmpbexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpbexadd %r18d, %r22d, 291(%r28,%r29,4) + ## MRMSrcMem4VOp3 # CHECK: bzhiq %r19, 291(%r28,%r29,4), %r23 diff --git a/llvm/test/MC/X86/apx/evex-format-intel.s b/llvm/test/MC/X86/apx/evex-format-intel.s index ececb7137b110..1b8f761cdfd3a 100644 --- a/llvm/test/MC/X86/apx/evex-format-intel.s +++ b/llvm/test/MC/X86/apx/evex-format-intel.s @@ -60,6 +60,12 @@ # CHECK: encoding: [0x62,0xf9,0x79,0x48,0x73,0x3c,0x08,0x00] vpslldq zmm0, zmmword ptr [r16 + r17], 0 +## MRMDestMem4VOp3CC + +# CHECK: cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe6,0xb4,0xac,0x23,0x01,0x00,0x00] + cmpbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + ## MRMSrcMem4VOp3 # CHECK: bzhi r23, qword ptr [r28 + 4*r29 + 291], r19 From e2d0f50cd6f2887c32508faba54a9a9499576a4e Mon Sep 17 00:00:00 2001 From: Ben Shi <2283975856@qq.com> Date: Fri, 22 Dec 2023 15:34:13 +0800 Subject: [PATCH 151/342] [clang][NFC] Remove trailing whitespace characters --- clang/lib/Serialization/ASTReaderStmt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index cf37ffe4c38b5..21aed570ba26c 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2007,7 +2007,7 @@ void ASTStmtReader::VisitCXXDependentScopeMemberExpr( E->QualifierLoc = Record.readNestedNameSpecifierLoc(); // not ImplicitAccess if (CurrentUnpackingBits->getNextBit()) - E->Base = Record.readSubExpr(); + E->Base = Record.readSubExpr(); else E->Base = nullptr; From a15532d7647a8a4b7fd2889bd97f6f72f273c4bf Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman154@yahoo.com> Date: Thu, 21 Dec 2023 23:39:28 -0800 Subject: [PATCH 152/342] [X86] Add CPU detection for more znver2 CPUs (#74955) This patch adds proper detection support for more znver2 CPUs. Specifically, this adds in support for CPUs codenamed Renoir, Lucienne, and Mendocino. This was originally proposedfor Renoir in https://reviews.llvm.org/D96220 and got approved, but slipped through the cracks. However, there is still a demand for this feature. 
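As background (a rough sketch only; this patch does not change how the values are obtained): the Family/Model numbers used below are the effective CPUID values derived from leaf 1 EAX, so family 23 is 0x17 (Zen/Zen+/Zen 2) and family 25 is 0x19 (Zen 3/Zen 3+/Zen 4):

    unsigned Family = (EAX >> 8) & 0xf;   // base family
    unsigned Model  = (EAX >> 4) & 0xf;   // base model
    if (Family == 0x6 || Family == 0xf) {
      if (Family == 0xf)
        Family += (EAX >> 20) & 0xff;     // extended family, e.g. 0xf + 0x8 = 0x17
      Model += ((EAX >> 16) & 0xf) << 4;  // extended model
    }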
In addition to adding support for more znver2 CPUs, this patch also includes some additional refactoring and comments related to cpu model information for zen CPUs. Fixes https://github.com/llvm/llvm-project/issues/74934. --- compiler-rt/lib/builtins/cpu_model/x86.c | 50 +++++++++++++++++------ llvm/lib/TargetParser/Host.cpp | 52 +++++++++++++++++------- 2 files changed, 74 insertions(+), 28 deletions(-) diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c index 72b0d55d65f0f..c6a917715e12c 100644 --- a/compiler-rt/lib/builtins/cpu_model/x86.c +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -647,35 +647,59 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, case 23: CPU = "znver1"; *Type = AMDFAM17H; - if ((Model >= 0x30 && Model <= 0x3f) || Model == 0x71) { + if ((Model >= 0x30 && Model <= 0x3f) || (Model == 0x47) || + (Model >= 0x60 && Model <= 0x67) || (Model >= 0x68 && Model <= 0x6f) || + (Model >= 0x70 && Model <= 0x7f) || (Model >= 0x84 && Model <= 0x87) || + (Model >= 0x90 && Model <= 0x97) || (Model >= 0x98 && Model <= 0x9f) || + (Model >= 0xa0 && Model <= 0xaf)) { + // Family 17h Models 30h-3Fh (Starship) Zen 2 + // Family 17h Models 47h (Cardinal) Zen 2 + // Family 17h Models 60h-67h (Renoir) Zen 2 + // Family 17h Models 68h-6Fh (Lucienne) Zen 2 + // Family 17h Models 70h-7Fh (Matisse) Zen 2 + // Family 17h Models 84h-87h (ProjectX) Zen 2 + // Family 17h Models 90h-97h (VanGogh) Zen 2 + // Family 17h Models 98h-9Fh (Mero) Zen 2 + // Family 17h Models A0h-AFh (Mendocino) Zen 2 CPU = "znver2"; *Subtype = AMDFAM17H_ZNVER2; - break; // 30h-3fh, 71h: Zen2 + break; } - if (Model <= 0x0f) { + if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x20 && Model <= 0x2f)) { + // Family 17h Models 10h-1Fh (Raven1) Zen + // Family 17h Models 10h-1Fh (Picasso) Zen+ + // Family 17h Models 20h-2Fh (Raven2 x86) Zen *Subtype = AMDFAM17H_ZNVER1; - break; // 00h-0Fh: Zen1 + break; } break; case 25: CPU = "znver3"; *Type = AMDFAM19H; - if (Model <= 0x0f || (Model >= 0x20 && Model <= 0x5f)) { - // Family 19h Models 00h-0Fh - Zen3 - // Family 19h Models 20h-2Fh - Zen3 - // Family 19h Models 30h-3Fh - Zen3 - // Family 19h Models 40h-4Fh - Zen3+ - // Family 19h Models 50h-5Fh - Zen3+ + if ((Model >= 0x00 && Model <= 0x0f) || (Model >= 0x20 && Model <= 0x2f) || + (Model >= 0x30 && Model <= 0x3f) || (Model >= 0x40 && Model <= 0x4f) || + (Model >= 0x50 && Model <= 0x5f)) { + // Family 19h Models 00h-0Fh (Genesis, Chagall) Zen 3 + // Family 19h Models 20h-2Fh (Vermeer) Zen 3 + // Family 19h Models 30h-3Fh (Badami) Zen 3 + // Family 19h Models 40h-4Fh (Rembrandt) Zen 3+ + // Family 19h Models 50h-5Fh (Cezanne) Zen 3 *Subtype = AMDFAM19H_ZNVER3; break; } - if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x60 && Model <= 0x74) || - (Model >= 0x78 && Model <= 0x7b) || (Model >= 0xA0 && Model <= 0xAf)) { + if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x60 && Model <= 0x6f) || + (Model >= 0x70 && Model <= 0x77) || (Model >= 0x78 && Model <= 0x7f) || + (Model >= 0xa0 && Model <= 0xaf)) { + // Family 19h Models 10h-1Fh (Stones; Storm Peak) Zen 4 + // Family 19h Models 60h-6Fh (Raphael) Zen 4 + // Family 19h Models 70h-77h (Phoenix, Hawkpoint1) Zen 4 + // Family 19h Models 78h-7Fh (Phoenix 2, Hawkpoint2) Zen 4 + // Family 19h Models A0h-AFh (Stones-Dense) Zen 4 CPU = "znver4"; *Subtype = AMDFAM19H_ZNVER4; break; // "znver4" } - break; + break; // family 19h default: break; // Unknown AMD CPU. 
} diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index e61fcb248faec..11c5000acc077 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1131,37 +1131,59 @@ getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model, case 23: CPU = "znver1"; *Type = X86::AMDFAM17H; - if ((Model >= 0x30 && Model <= 0x3f) || Model == 0x71) { + if ((Model >= 0x30 && Model <= 0x3f) || (Model == 0x47) || + (Model >= 0x60 && Model <= 0x67) || (Model >= 0x68 && Model <= 0x6f) || + (Model >= 0x70 && Model <= 0x7f) || (Model >= 0x84 && Model <= 0x87) || + (Model >= 0x90 && Model <= 0x97) || (Model >= 0x98 && Model <= 0x9f) || + (Model >= 0xa0 && Model <= 0xaf)) { + // Family 17h Models 30h-3Fh (Starship) Zen 2 + // Family 17h Models 47h (Cardinal) Zen 2 + // Family 17h Models 60h-67h (Renoir) Zen 2 + // Family 17h Models 68h-6Fh (Lucienne) Zen 2 + // Family 17h Models 70h-7Fh (Matisse) Zen 2 + // Family 17h Models 84h-87h (ProjectX) Zen 2 + // Family 17h Models 90h-97h (VanGogh) Zen 2 + // Family 17h Models 98h-9Fh (Mero) Zen 2 + // Family 17h Models A0h-AFh (Mendocino) Zen 2 CPU = "znver2"; *Subtype = X86::AMDFAM17H_ZNVER2; - break; // 30h-3fh, 71h: Zen2 + break; } - if (Model <= 0x0f) { + if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x20 && Model <= 0x2f)) { + // Family 17h Models 10h-1Fh (Raven1) Zen + // Family 17h Models 10h-1Fh (Picasso) Zen+ + // Family 17h Models 20h-2Fh (Raven2 x86) Zen *Subtype = X86::AMDFAM17H_ZNVER1; - break; // 00h-0Fh: Zen1 + break; } break; case 25: CPU = "znver3"; *Type = X86::AMDFAM19H; - if (Model <= 0x0f || (Model >= 0x20 && Model <= 0x5f)) { - // Family 19h Models 00h-0Fh - Zen3 - // Family 19h Models 20h-2Fh - Zen3 - // Family 19h Models 30h-3Fh - Zen3 - // Family 19h Models 40h-4Fh - Zen3+ - // Family 19h Models 50h-5Fh - Zen3+ + if ((Model >= 0x00 && Model <= 0x0f) || (Model >= 0x20 && Model <= 0x2f) || + (Model >= 0x30 && Model <= 0x3f) || (Model >= 0x40 && Model <= 0x4f) || + (Model >= 0x50 && Model <= 0x5f)) { + // Family 19h Models 00h-0Fh (Genesis, Chagall) Zen 3 + // Family 19h Models 20h-2Fh (Vermeer) Zen 3 + // Family 19h Models 30h-3Fh (Badami) Zen 3 + // Family 19h Models 40h-4Fh (Rembrandt) Zen 3+ + // Family 19h Models 50h-5Fh (Cezanne) Zen 3 *Subtype = X86::AMDFAM19H_ZNVER3; break; } - if ((Model >= 0x10 && Model <= 0x1f) || - (Model >= 0x60 && Model <= 0x74) || - (Model >= 0x78 && Model <= 0x7b) || - (Model >= 0xA0 && Model <= 0xAf)) { + if ((Model >= 0x10 && Model <= 0x1f) || (Model >= 0x60 && Model <= 0x6f) || + (Model >= 0x70 && Model <= 0x77) || (Model >= 0x78 && Model <= 0x7f) || + (Model >= 0xa0 && Model <= 0xaf)) { + // Family 19h Models 10h-1Fh (Stones; Storm Peak) Zen 4 + // Family 19h Models 60h-6Fh (Raphael) Zen 4 + // Family 19h Models 70h-77h (Phoenix, Hawkpoint1) Zen 4 + // Family 19h Models 78h-7Fh (Phoenix 2, Hawkpoint2) Zen 4 + // Family 19h Models A0h-AFh (Stones-Dense) Zen 4 CPU = "znver4"; *Subtype = X86::AMDFAM19H_ZNVER4; break; // "znver4" } - break; // family 19h + break; default: break; // Unknown AMD CPU. } From 73b86d1b2d6822984c24d27da10ea3de7056931f Mon Sep 17 00:00:00 2001 From: Matthias Springer <me@m-sp.org> Date: Fri, 22 Dec 2023 16:44:07 +0900 Subject: [PATCH 153/342] [mlir][Transforms] `GreedyPatternRewriteDriver`: verify IR (#74270) This commit adds an additional "expensive check" that verifies the IR before starting a greedy pattern rewriter, after every pattern application and after every folding. (Only if `MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS` is set.) 
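For reference, each of those points gets a guard of the following shape (copied from the change below; building with the existing -DMLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS=ON CMake option is assumed to be the way to enable it):

    #if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
      if (config.scope && failed(verify(config.scope->getParentOp())))
        llvm::report_fatal_error("IR failed to verify after pattern application");
    #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS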
It also adds an assertion that the `scope` region (part of `GreedyRewriteConfig`) is not being erased as part of the greedy pattern rewrite. That would break the scoping mechanism and the expensive checks. This commit does not fix any patterns, this is done in separate commits. --- .../Utils/GreedyPatternRewriteDriver.cpp | 45 +++++++++++++++++-- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp index 7decbce018a87..eca13f52f53dc 100644 --- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp +++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp @@ -15,6 +15,7 @@ #include "mlir/Config/mlir-config.h" #include "mlir/IR/Action.h" #include "mlir/IR/Matchers.h" +#include "mlir/IR/Verifier.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Rewrite/PatternApplicator.h" #include "mlir/Transforms/FoldUtils.h" @@ -432,6 +433,10 @@ bool GreedyPatternRewriteDriver::processWorklist() { if (succeeded(folder.tryToFold(op))) { LLVM_DEBUG(logResultWithLine("success", "operation was folded")); changed = true; +#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + if (config.scope && failed(verify(config.scope->getParentOp()))) + llvm::report_fatal_error("IR failed to verify after folding"); +#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS continue; } @@ -464,8 +469,9 @@ bool GreedyPatternRewriteDriver::processWorklist() { #endif #if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS - debugFingerPrints.computeFingerPrints( - /*topLevel=*/config.scope ? config.scope->getParentOp() : op); + if (config.scope) { + debugFingerPrints.computeFingerPrints(config.scope->getParentOp()); + } auto clearFingerprints = llvm::make_scope_exit([&]() { debugFingerPrints.clear(); }); #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS @@ -473,17 +479,24 @@ bool GreedyPatternRewriteDriver::processWorklist() { LogicalResult matchResult = matcher.matchAndRewrite(op, *this, canApply, onFailure, onSuccess); +#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + if (config.scope && failed(verify(config.scope->getParentOp()))) + llvm::report_fatal_error("IR failed to verify after pattern application"); +#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + if (succeeded(matchResult)) { LLVM_DEBUG(logResultWithLine("success", "pattern matched")); #if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS - debugFingerPrints.notifyRewriteSuccess(); + if (config.scope) + debugFingerPrints.notifyRewriteSuccess(); #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS changed = true; ++numRewrites; } else { LLVM_DEBUG(logResultWithLine("failure", "pattern failed to match")); #if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS - debugFingerPrints.notifyRewriteFailure(); + if (config.scope) + debugFingerPrints.notifyRewriteFailure(); #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS } } @@ -562,6 +575,18 @@ void GreedyPatternRewriteDriver::notifyOperationRemoved(Operation *op) { logger.startLine() << "** Erase : '" << op->getName() << "'(" << op << ")\n"; }); + +#ifndef NDEBUG + // Only ops that are within the configured scope are added to the worklist of + // the greedy pattern rewriter. Moreover, the parent op of the scope region is + // the part of the IR that is taken into account for the "expensive checks". + // A greedy pattern rewrite is not allowed to erase the parent op of the scope + // region, as that would break the worklist handling and the expensive checks. 
+ if (config.scope && config.scope->getParentOp() == op) + llvm_unreachable( + "scope region must not be erased during greedy pattern rewrite"); +#endif // NDEBUG + if (config.listener) config.listener->notifyOperationRemoved(op); @@ -721,6 +746,12 @@ mlir::applyPatternsAndFoldGreedily(Region ®ion, if (!config.scope) config.scope = ®ion; +#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + if (failed(verify(config.scope->getParentOp()))) + llvm::report_fatal_error( + "greedy pattern rewriter input IR failed to verify"); +#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + // Start the pattern driver. RegionPatternRewriteDriver driver(region.getContext(), patterns, config, region); @@ -846,6 +877,12 @@ LogicalResult mlir::applyOpPatternsAndFold( #endif // NDEBUG } +#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + if (config.scope && failed(verify(config.scope->getParentOp()))) + llvm::report_fatal_error( + "greedy pattern rewriter input IR failed to verify"); +#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + // Start the pattern driver. llvm::SmallDenseSet<Operation *, 4> surviving; MultiOpPatternRewriteDriver driver(ops.front()->getContext(), patterns, From ff32ab3ae7f4ae32907bb802b67a962743db7ba0 Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Fri, 22 Dec 2023 14:36:42 +0800 Subject: [PATCH 154/342] [X86][NFC] Not imply TB in PS|PD|XS|XD This can help us aovid introducing new classes T_MAP*PS|PD|XS|XD when a new opcode map is supported. And, T_MAP*PS|PD|XS|XD does not look better than T_MAP*, PS|PD|XS|XD. --- llvm/lib/Target/X86/X86InstrAMX.td | 30 +- llvm/lib/Target/X86/X86InstrAVX512.td | 742 ++++++++++----------- llvm/lib/Target/X86/X86InstrArithmetic.td | 28 +- llvm/lib/Target/X86/X86InstrFPStack.td | 8 +- llvm/lib/Target/X86/X86InstrKL.td | 22 +- llvm/lib/Target/X86/X86InstrMMX.td | 12 +- llvm/lib/Target/X86/X86InstrMisc.td | 182 ++--- llvm/lib/Target/X86/X86InstrRAOINT.td | 8 +- llvm/lib/Target/X86/X86InstrSGX.td | 6 +- llvm/lib/Target/X86/X86InstrSNP.td | 12 +- llvm/lib/Target/X86/X86InstrSSE.td | 538 +++++++-------- llvm/lib/Target/X86/X86InstrShiftRotate.td | 28 +- llvm/lib/Target/X86/X86InstrSystem.td | 132 ++-- llvm/lib/Target/X86/X86InstrTDX.td | 14 +- llvm/lib/Target/X86/X86InstrTSX.td | 4 +- llvm/lib/Target/X86/X86InstrUtils.td | 185 +++-- llvm/lib/Target/X86/X86InstrVMX.td | 38 +- 17 files changed, 980 insertions(+), 1009 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 71e6a44c9d8e7..a4292b99511bb 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -20,32 +20,32 @@ let Predicates = [HasAMXTILE, In64BitMode] in { Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), "ldtilecfg\t$src", - [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS; + [(int_x86_ldtilecfg addr:$src)]>, VEX, T8, PS; let hasSideEffects = 1 in def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), "sttilecfg\t$src", - [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD; + [(int_x86_sttilecfg addr:$src)]>, VEX, T8, PD; let mayLoad = 1 in def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src), "tileloadd\t{$src, $dst|$dst, $src}", []>, - VEX, T8XD; + VEX, T8, XD; let mayLoad = 1 in def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src), "tileloaddt1\t{$src, $dst|$dst, $src}", []>, - VEX, T8PD; + VEX, T8, PD; let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def TILERELEASE : I<0x49, MRM_C0, 
(outs), (ins), - "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS; + "tilerelease", [(int_x86_tilerelease)]>, VEX, T8, PS; let mayStore = 1 in def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs), (ins sibmem:$dst, TILE:$src), "tilestored\t{$src, $dst|$dst, $src}", []>, - VEX, T8XS; + VEX, T8, XS; def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins), "tilezero\t$dst", []>, - VEX, T8XD; + VEX, T8, XD; // Pseduo instruction for RA. let isPseudo = true, mayLoad = 1, hasSideEffects = 1, @@ -91,19 +91,19 @@ let Predicates = [HasAMXINT8, In64BitMode] in { def TDPBSSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbssd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, - VEX, VVVV, T8XD; + VEX, VVVV, T8, XD; def TDPBSUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbsud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, - VEX, VVVV, T8XS; + VEX, VVVV, T8, XS; def TDPBUSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbusd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, - VEX, VVVV, T8PD; + VEX, VVVV, T8, PD; def TDPBUUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, - VEX, VVVV, T8PS; + VEX, VVVV, T8, PS; } // Pseduo instruction for RA. @@ -163,7 +163,7 @@ let Predicates = [HasAMXBF16, In64BitMode] in { def TDPBF16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - []>, VEX, VVVV, T8XS; + []>, VEX, VVVV, T8, XS; // Pseduo instruction for RA. let isPseudo = true, Constraints = "$src4 = $dst" in @@ -193,7 +193,7 @@ let Predicates = [HasAMXFP16, In64BitMode] in { def TDPFP16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, T8XD; + []>, VEX, VVVV, T8, XD; } // Pseduo instruction for RA. 
@@ -222,11 +222,11 @@ let Predicates = [HasAMXCOMPLEX, In64BitMode] in { def TCMMIMFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tcmmimfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, T8PD, VEX, VVVV; + []>, T8, PD, VEX, VVVV; def TCMMRLFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, WIG, T8PS; + []>, VEX, VVVV, WIG, T8, PS; } // Constraints = "$src1 = $dst" diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 86619dfd07bca..e3a4aee3aceb7 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1039,7 +1039,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr, (bitconvert (DestInfo.VT (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))], - DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>; + DestInfo.ExeDomain>, T8, PD, EVEX, Sched<[SchedRR]>; def rrkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", @@ -1051,7 +1051,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr, (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), MaskInfo.ImmAllZerosV))], - DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>; + DestInfo.ExeDomain>, T8, PD, EVEX, EVEX_KZ, Sched<[SchedRR]>; let Constraints = "$src0 = $dst" in def rrk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, @@ -1065,7 +1065,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr, (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), MaskInfo.RC:$src0))], - DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>; + DestInfo.ExeDomain>, T8, PD, EVEX, EVEX_K, Sched<[SchedRR]>; let hasSideEffects = 0, mayLoad = 1 in def rm : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), @@ -1076,7 +1076,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr, (bitconvert (DestInfo.VT (UnmaskedBcastOp addr:$src)))))], - DestInfo.ExeDomain>, T8PD, EVEX, + DestInfo.ExeDomain>, T8, PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; def rmkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst), @@ -1090,7 +1090,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr, (DestInfo.VT (SrcInfo.BroadcastLdFrag addr:$src)))), MaskInfo.ImmAllZerosV))], - DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, + DestInfo.ExeDomain>, T8, PD, EVEX, EVEX_KZ, EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; let Constraints = "$src0 = $dst", @@ -1107,7 +1107,7 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr, (DestInfo.VT (SrcInfo.BroadcastLdFrag addr:$src)))), MaskInfo.RC:$src0))], - DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, + DestInfo.ExeDomain>, T8, PD, EVEX, EVEX_K, EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>; } @@ -1173,7 +1173,7 @@ multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR, "vpbroadcast"#_.Suffix, "$src", "$src", (_.VT (OpNode SrcRC:$src)), /*IsCommutable*/0, /*IsKCommutable*/0, /*IsKZCommutable*/0, vselect>, - T8PD, EVEX, Sched<[SchedRR]>; + T8, PD, EVEX, Sched<[SchedRR]>; } multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite SchedRR, @@ -1185,7 +1185,7 @@ multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite Sched !con((ins _.RC:$src0, _.KRCWM:$mask), (ins 
GR32:$src)), !con((ins _.KRCWM:$mask), (ins GR32:$src)), "vpbroadcast"#_.Suffix, "$src", "$src", [], [], [], - "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>; + "$src0 = $dst">, T8, PD, EVEX, Sched<[SchedRR]>; def : Pat <(_.VT (OpNode SrcRC:$src)), (!cast<Instruction>(Name#rr) @@ -2082,7 +2082,7 @@ defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>, - T8PD, REX_W, EVEX_CD8<64, CD8VF>; + T8, REX_W, EVEX_CD8<64, CD8VF>; defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", SchedWriteVecALU, avx512vl_i8_info, HasBWI>, @@ -2098,7 +2098,7 @@ defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, - T8PD, REX_W, EVEX_CD8<64, CD8VF>; + T8, REX_W, EVEX_CD8<64, CD8VF>; } multiclass avx512_icmp_cc<bits<8> opc, string Suffix, PatFrag Frag, @@ -2625,40 +2625,40 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, let Predicates = [HasDQI, NoEGPR] in defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem>, avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32>, - VEX, PD; + VEX, TB, PD; let Predicates = [HasDQI, HasEGPR, In64BitMode] in defm KMOVB : avx512_mask_mov<0x90, 0x90, 0x91, "kmovb", VK8, v8i1, i8mem, "_EVEX">, avx512_mask_mov_gpr<0x92, 0x93, "kmovb", VK8, GR32, "_EVEX">, - EVEX, PD; + EVEX, TB, PD; let Predicates = [HasAVX512, NoEGPR] in defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, - VEX, PS; + VEX, TB, PS; let Predicates = [HasAVX512, HasEGPR, In64BitMode] in defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem, "_EVEX">, avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32, "_EVEX">, - EVEX, PS; + EVEX, TB, PS; let Predicates = [HasBWI, NoEGPR] in { defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>, - VEX, PD, REX_W; + VEX, TB, PD, REX_W; defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>, - VEX, XD; + VEX, TB, XD; defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>, - VEX, PS, REX_W; + VEX, TB, PS, REX_W; defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>, - VEX, XD, REX_W; + VEX, TB, XD, REX_W; } let Predicates = [HasBWI, HasEGPR, In64BitMode] in { defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem, "_EVEX">, - EVEX, PD, REX_W; + EVEX, TB, PD, REX_W; defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32, "_EVEX">, - EVEX, XD; + EVEX, TB, XD; defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem, "_EVEX">, - EVEX, PS, REX_W; + EVEX, TB, PS, REX_W; defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64, "_EVEX">, - EVEX, XD, REX_W; + EVEX, TB, XD, REX_W; } // GR from/to mask register @@ -2769,13 +2769,13 @@ multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, X86FoldableSchedWrite sched> { defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, - sched, HasDQI>, VEX, PD; + sched, HasDQI>, VEX, TB, PD; defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - sched, HasAVX512>, VEX, PS; + sched, HasAVX512>, VEX, TB, PS; defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, - sched, HasBWI>, VEX, PD, REX_W; + sched, HasBWI>, VEX, TB, PD, REX_W; defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, 
"q"), VK64, OpNode, - sched, HasBWI>, VEX, PS, REX_W; + sched, HasBWI>, VEX, TB, PS, REX_W; } // TODO - do we need a X86SchedWriteWidths::KMASK type? @@ -2812,13 +2812,13 @@ multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, bit IsCommutable, Predicate prdW = HasAVX512> { defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, - sched, HasDQI, IsCommutable>, VEX, VVVV, VEX_L, PD; + sched, HasDQI, IsCommutable>, VEX, VVVV, VEX_L, TB, PD; defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - sched, prdW, IsCommutable>, VEX, VVVV, VEX_L, PS; + sched, prdW, IsCommutable>, VEX, VVVV, VEX_L, TB, PS; defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, - sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, PD; + sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, TB, PD; defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, - sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, PS; + sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, TB, PS; } // TODO - do we need a X86SchedWriteWidths::KMASK type? @@ -2876,9 +2876,9 @@ multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst, } } -defm KUNPCKBW : avx512_mask_unpck<"bw", v16i1_info, v8i1_info, WriteShuffle, HasAVX512>, PD; -defm KUNPCKWD : avx512_mask_unpck<"wd", v32i1_info, v16i1_info, WriteShuffle, HasBWI>, PS; -defm KUNPCKDQ : avx512_mask_unpck<"dq", v64i1_info, v32i1_info, WriteShuffle, HasBWI>, PS, REX_W; +defm KUNPCKBW : avx512_mask_unpck<"bw", v16i1_info, v8i1_info, WriteShuffle, HasAVX512>, TB, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", v32i1_info, v16i1_info, WriteShuffle, HasBWI>, TB, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", v64i1_info, v32i1_info, WriteShuffle, HasBWI>, TB, PS, REX_W; // Mask bit testing multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, @@ -2895,13 +2895,13 @@ multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, Predicate prdW = HasAVX512> { defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, sched, HasDQI>, - VEX, PD; + VEX, TB, PD; defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, sched, prdW>, - VEX, PS; + VEX, TB, PS; defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, sched, HasBWI>, - VEX, PS, REX_W; + VEX, TB, PS, REX_W; defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, sched, HasBWI>, - VEX, PD, REX_W; + VEX, TB, PD, REX_W; } // TODO - do we need a X86SchedWriteWidths::KMASK type? 
@@ -2922,15 +2922,15 @@ multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched> { defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode, - sched>, VEX, TAPD, REX_W; + sched>, VEX, TA, PD, REX_W; let Predicates = [HasDQI] in defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode, - sched>, VEX, TAPD; + sched>, VEX, TA, PD; let Predicates = [HasBWI] in { defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode, - sched>, VEX, TAPD, REX_W; + sched>, VEX, TA, PD, REX_W; defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode, - sched>, VEX, TAPD; + sched>, VEX, TA, PD; } } @@ -3371,25 +3371,25 @@ defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, - PS, EVEX_CD8<32, CD8VF>; + TB, PS, EVEX_CD8<32, CD8VF>; defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, HasAVX512, SchedWriteFMoveLS, "VMOVAPD">, avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, HasAVX512, SchedWriteFMoveLS, "VMOVAPD">, - PD, REX_W, EVEX_CD8<64, CD8VF>; + TB, PD, REX_W, EVEX_CD8<64, CD8VF>; defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, SchedWriteFMoveLS, "VMOVUPS", 0, null_frag>, avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512, SchedWriteFMoveLS, "VMOVUPS">, - PS, EVEX_CD8<32, CD8VF>; + TB, PS, EVEX_CD8<32, CD8VF>; defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, SchedWriteFMoveLS, "VMOVUPD", 0, null_frag>, avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512, SchedWriteFMoveLS, "VMOVUPD">, - PD, REX_W, EVEX_CD8<64, CD8VF>; + TB, PD, REX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512, SchedWriteVecMoveLS, @@ -3397,7 +3397,7 @@ defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, HasAVX512, SchedWriteVecMoveLS, "VMOVDQA", 1>, - PD, EVEX_CD8<32, CD8VF>; + TB, PD, EVEX_CD8<32, CD8VF>; defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, HasAVX512, SchedWriteVecMoveLS, @@ -3405,31 +3405,31 @@ defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, HasAVX512, SchedWriteVecMoveLS, "VMOVDQA">, - PD, REX_W, EVEX_CD8<64, CD8VF>; + TB, PD, REX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, SchedWriteVecMoveLS, "VMOVDQU", 1>, avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, HasBWI, SchedWriteVecMoveLS, "VMOVDQU", 1>, - XD, EVEX_CD8<8, CD8VF>; + TB, XD, EVEX_CD8<8, CD8VF>; defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, SchedWriteVecMoveLS, "VMOVDQU", 1>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, HasBWI, SchedWriteVecMoveLS, "VMOVDQU", 1>, - XD, REX_W, EVEX_CD8<16, CD8VF>; + TB, XD, REX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, SchedWriteVecMoveLS, "VMOVDQU", 1, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512, SchedWriteVecMoveLS, "VMOVDQU", 1>, - XS, EVEX_CD8<32, CD8VF>; + TB, XS, EVEX_CD8<32, CD8VF>; defm 
VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, SchedWriteVecMoveLS, "VMOVDQU", 0, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512, SchedWriteVecMoveLS, "VMOVDQU">, - XS, REX_W, EVEX_CD8<64, CD8VF>; + TB, XS, REX_W, EVEX_CD8<64, CD8VF>; // Special instructions to help with spilling when we don't have VLX. We need // to load or store from a ZMM register instead. These are converted in @@ -3816,12 +3816,12 @@ def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), (iPTR 0)))]>, - PD, EVEX, REX_W, Sched<[WriteVecMoveToGpr]>, + TB, PD, EVEX, REX_W, Sched<[WriteVecMoveToGpr]>, Requires<[HasAVX512]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src), - "vmovq\t{$src, $dst|$dst, $src}", []>, PD, + "vmovq\t{$src, $dst|$dst, $src}", []>, TB, PD, EVEX, REX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteVecStore]>, Requires<[HasAVX512, In64BitMode]>; @@ -3830,7 +3830,7 @@ def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), "vmovq\t{$src, $dst|$dst, $src}", [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), addr:$dst)]>, - EVEX, PD, REX_W, EVEX_CD8<64, CD8VT1>, + EVEX, TB, PD, REX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteVecStore]>, Requires<[HasAVX512]>; let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in @@ -3954,14 +3954,14 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag, } defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>, - VEX_LIG, XS, EVEX_CD8<32, CD8VT1>; + VEX_LIG, TB, XS, EVEX_CD8<32, CD8VT1>; defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>, - VEX_LIG, XD, REX_W, EVEX_CD8<64, CD8VT1>; + VEX_LIG, TB, XD, REX_W, EVEX_CD8<64, CD8VT1>; defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info, HasFP16>, - VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>; + VEX_LIG, T_MAP5, XS, EVEX_CD8<16, CD8VT1>; multiclass avx512_move_scalar_lowering<string InstrStr, SDNode OpNode, PatLeaf ZeroFP, X86VectorVTInfo _> { @@ -4286,7 +4286,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { def VMOVSHZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovsh\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, T_MAP5XS, EVEX, VVVV, VEX_LIG, + []>, T_MAP5, XS, EVEX, VVVV, VEX_LIG, Sched<[SchedWriteFShuffle.XMM]>; let Constraints = "$src0 = $dst" in @@ -4295,20 +4295,20 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { VR128X:$src1, VR128X:$src2), "vmovsh\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", - []>, T_MAP5XS, EVEX_K, EVEX, VVVV, VEX_LIG, + []>, T_MAP5, XS, EVEX_K, EVEX, VVVV, VEX_LIG, Sched<[SchedWriteFShuffle.XMM]>; def VMOVSHZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f16x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2), "vmovsh\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - []>, EVEX_KZ, T_MAP5XS, EVEX, VVVV, VEX_LIG, + []>, EVEX_KZ, T_MAP5, XS, EVEX, VVVV, VEX_LIG, Sched<[SchedWriteFShuffle.XMM]>; } def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, EVEX, VVVV, VEX_LIG, + []>, TB, XS, EVEX, VVVV, VEX_LIG, Sched<[SchedWriteFShuffle.XMM]>; let Constraints = "$src0 = $dst" in @@ -4317,20 
+4317,20 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { VR128X:$src1, VR128X:$src2), "vmovss\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", - []>, EVEX_K, XS, EVEX, VVVV, VEX_LIG, + []>, EVEX_K, TB, XS, EVEX, VVVV, VEX_LIG, Sched<[SchedWriteFShuffle.XMM]>; def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2), "vmovss\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - []>, EVEX_KZ, XS, EVEX, VVVV, VEX_LIG, + []>, EVEX_KZ, TB, XS, EVEX, VVVV, VEX_LIG, Sched<[SchedWriteFShuffle.XMM]>; def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XD, EVEX, VVVV, VEX_LIG, REX_W, + []>, TB, XD, EVEX, VVVV, VEX_LIG, REX_W, Sched<[SchedWriteFShuffle.XMM]>; let Constraints = "$src0 = $dst" in @@ -4339,7 +4339,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { VR128X:$src1, VR128X:$src2), "vmovsd\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", - []>, EVEX_K, XD, EVEX, VVVV, VEX_LIG, + []>, EVEX_K, TB, XD, EVEX, VVVV, VEX_LIG, REX_W, Sched<[SchedWriteFShuffle.XMM]>; def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), @@ -4347,7 +4347,7 @@ let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { VR128X:$src2), "vmovsd\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - []>, EVEX_KZ, XD, EVEX, VVVV, VEX_LIG, + []>, EVEX_KZ, TB, XD, EVEX, VVVV, VEX_LIG, REX_W, Sched<[SchedWriteFShuffle.XMM]>; } @@ -4546,20 +4546,20 @@ let Predicates = [HasAVX512] in { def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst), (ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.ZMM.RM]>, - EVEX, T8PD, EVEX_V512, EVEX_CD8<64, CD8VF>; + EVEX, T8, PD, EVEX_V512, EVEX_CD8<64, CD8VF>; let Predicates = [HasVLX] in { def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst), (ins i256mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.YMM.RM]>, - EVEX, T8PD, EVEX_V256, EVEX_CD8<64, CD8VF>; + EVEX, T8, PD, EVEX_V256, EVEX_CD8<64, CD8VF>; def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst), (ins i128mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}", [], SSEPackedInt>, Sched<[SchedWriteVecMoveLS.XMM.RM]>, - EVEX, T8PD, EVEX_V128, EVEX_CD8<64, CD8VF>; + EVEX, T8, PD, EVEX_V128, EVEX_CD8<64, CD8VF>; } multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, @@ -4585,11 +4585,11 @@ multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, } defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info, - SchedWriteVecMoveLSNT>, PD; + SchedWriteVecMoveLSNT>, TB, PD; defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info, - SchedWriteFMoveLSNT>, PD, REX_W; + SchedWriteFMoveLSNT>, TB, PD, REX_W; defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info, - SchedWriteFMoveLSNT>, PS; + SchedWriteFMoveLSNT>, TB, PS; let Predicates = [HasAVX512], AddedComplexity = 400 in { def : Pat<(alignednontemporalstore (v16i32 VR512:$src), addr:$dst), @@ -4829,22 +4829,22 @@ defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", uaddsat, defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", usubsat, SchedWriteVecALU, HasBWI, 0>; defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, - SchedWritePMULLD, HasAVX512, 1>, T8PD; + SchedWritePMULLD, HasAVX512, 
1>, T8; defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, SchedWriteVecIMul, HasBWI, 1>; defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, - SchedWriteVecIMul, HasDQI, 1>, T8PD, + SchedWriteVecIMul, HasDQI, 1>, T8, NotEVEX2VEXConvertible; defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SchedWriteVecIMul, HasBWI, 1>; defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SchedWriteVecIMul, HasBWI, 1>; defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, - SchedWriteVecIMul, HasBWI, 1>, T8PD; + SchedWriteVecIMul, HasBWI, 1>, T8; defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", avgceilu, SchedWriteVecALU, HasBWI, 1>; defm VPMULDQ : avx512_binop_rm_vl_q<0x28, "vpmuldq", X86pmuldq, - SchedWriteVecIMul, HasAVX512, 1>, T8PD; + SchedWriteVecIMul, HasAVX512, 1>, T8; defm VPMULUDQ : avx512_binop_rm_vl_q<0xF4, "vpmuludq", X86pmuludq, SchedWriteVecIMul, HasAVX512, 1>; @@ -4872,7 +4872,7 @@ multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SchedWriteVecALU, avx512vl_i8_info, avx512vl_i8_info, - X86multishift, HasVBMI, 0>, T8PD; + X86multishift, HasVBMI, 0>, T8; multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _Src, X86VectorVTInfo _Dst, @@ -4967,48 +4967,48 @@ defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512B defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase; defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw, - avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD, WIG; + avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8, WIG; defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd, avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase, WIG; defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax, - SchedWriteVecALU, HasBWI, 1>, T8PD; + SchedWriteVecALU, HasBWI, 1>, T8; defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax, SchedWriteVecALU, HasBWI, 1>; defm VPMAXSD : avx512_binop_rm_vl_d<0x3D, "vpmaxsd", smax, - SchedWriteVecALU, HasAVX512, 1>, T8PD; + SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMAXSQ : avx512_binop_rm_vl_q<0x3D, "vpmaxsq", smax, - SchedWriteVecALU, HasAVX512, 1>, T8PD, + SchedWriteVecALU, HasAVX512, 1>, T8, NotEVEX2VEXConvertible; defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax, SchedWriteVecALU, HasBWI, 1>; defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax, - SchedWriteVecALU, HasBWI, 1>, T8PD; + SchedWriteVecALU, HasBWI, 1>, T8; defm VPMAXUD : avx512_binop_rm_vl_d<0x3F, "vpmaxud", umax, - SchedWriteVecALU, HasAVX512, 1>, T8PD; + SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMAXUQ : avx512_binop_rm_vl_q<0x3F, "vpmaxuq", umax, - SchedWriteVecALU, HasAVX512, 1>, T8PD, + SchedWriteVecALU, HasAVX512, 1>, T8, NotEVEX2VEXConvertible; defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin, - SchedWriteVecALU, HasBWI, 1>, T8PD; + SchedWriteVecALU, HasBWI, 1>, T8; defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin, SchedWriteVecALU, HasBWI, 1>; defm VPMINSD : avx512_binop_rm_vl_d<0x39, "vpminsd", smin, - SchedWriteVecALU, HasAVX512, 1>, T8PD; + SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMINSQ : avx512_binop_rm_vl_q<0x39, "vpminsq", smin, - SchedWriteVecALU, HasAVX512, 1>, T8PD, + SchedWriteVecALU, HasAVX512, 1>, T8, NotEVEX2VEXConvertible; defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin, SchedWriteVecALU, HasBWI, 1>; defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin, - 
SchedWriteVecALU, HasBWI, 1>, T8PD; + SchedWriteVecALU, HasBWI, 1>, T8; defm VPMINUD : avx512_binop_rm_vl_d<0x3B, "vpminud", umin, - SchedWriteVecALU, HasAVX512, 1>, T8PD; + SchedWriteVecALU, HasAVX512, 1>, T8; defm VPMINUQ : avx512_binop_rm_vl_q<0x3B, "vpminuq", umin, - SchedWriteVecALU, HasAVX512, 1>, T8PD, + SchedWriteVecALU, HasAVX512, 1>, T8, NotEVEX2VEXConvertible; // PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. @@ -5445,18 +5445,18 @@ multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDPatternOperator sched.PS.Scl, IsCommutable>, avx512_fp_scalar_round<opc, OpcodeStr#"ss", f32x_info, RndNode, sched.PS.Scl>, - XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>; + TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, sched.PD.Scl, IsCommutable>, avx512_fp_scalar_round<opc, OpcodeStr#"sd", f64x_info, RndNode, sched.PD.Scl>, - XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>; + TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>; let Predicates = [HasFP16] in defm SHZ : avx512_fp_scalar<opc, OpcodeStr#"sh", f16x_info, OpNode, VecNode, sched.PH.Scl, IsCommutable>, avx512_fp_scalar_round<opc, OpcodeStr#"sh", f16x_info, RndNode, sched.PH.Scl>, - T_MAP5XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>; + T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>; } multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -5465,16 +5465,16 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode, defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode, SaeNode, sched.PS.Scl, IsCommutable, NAME#"SS">, - XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>; + TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode, SaeNode, sched.PD.Scl, IsCommutable, NAME#"SD">, - XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>; + TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>; let Predicates = [HasFP16] in { defm SHZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sh", f16x_info, OpNode, VecNode, SaeNode, sched.PH.Scl, IsCommutable, NAME#"SH">, - T_MAP5XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, + T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, NotEVEX2VEXConvertible; } } @@ -5515,29 +5515,29 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, } } defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, - SchedWriteFCmp.Scl, "VMINCSS">, XS, + SchedWriteFCmp.Scl, "VMINCSS">, TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc, - SchedWriteFCmp.Scl, "VMINCSD">, XD, + SchedWriteFCmp.Scl, "VMINCSD">, TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, SIMD_EXC; defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc, - SchedWriteFCmp.Scl, "VMAXCSS">, XS, + SchedWriteFCmp.Scl, "VMAXCSS">, TB, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, - SchedWriteFCmp.Scl, "VMAXCSD">, XD, + SchedWriteFCmp.Scl, "VMAXCSD">, TB, XD, REX_W, EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, SIMD_EXC; defm VMINCSHZ : avx512_comutable_binop_s<0x5D, "vminsh", f16x_info, X86fminc, - SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5XS, + SchedWriteFCmp.Scl, "VMINCSH">, T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, NotEVEX2VEXConvertible; defm 
VMAXCSHZ : avx512_comutable_binop_s<0x5F, "vmaxsh", f16x_info, X86fmaxc, - SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5XS, + SchedWriteFCmp.Scl, "VMAXCSH">, T_MAP5, XS, EVEX, VVVV, VEX_LIG, EVEX_CD8<16, CD8VT1>, SIMD_EXC, NotEVEX2VEXConvertible; @@ -5607,27 +5607,27 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator Op bit IsPD128Commutable = IsCommutable> { let Predicates = [prd] in { defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f32_info, - sched.PS.ZMM, IsCommutable>, EVEX_V512, PS, + sched.PS.ZMM, IsCommutable>, EVEX_V512, TB, PS, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f64_info, - sched.PD.ZMM, IsCommutable>, EVEX_V512, PD, REX_W, + sched.PD.ZMM, IsCommutable>, EVEX_V512, TB, PD, REX_W, EVEX_CD8<64, CD8VF>; } // Define only if AVX512VL feature is present. let Predicates = [prd, HasVLX] in { defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f32x_info, - sched.PS.XMM, IsCommutable>, EVEX_V128, PS, + sched.PS.XMM, IsCommutable>, EVEX_V128, TB, PS, EVEX_CD8<32, CD8VF>; defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f32x_info, - sched.PS.YMM, IsCommutable>, EVEX_V256, PS, + sched.PS.YMM, IsCommutable>, EVEX_V256, TB, PS, EVEX_CD8<32, CD8VF>; defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v2f64x_info, sched.PD.XMM, IsPD128Commutable, - IsCommutable>, EVEX_V128, PD, REX_W, + IsCommutable>, EVEX_V128, TB, PD, REX_W, EVEX_CD8<64, CD8VF>; defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f64x_info, - sched.PD.YMM, IsCommutable>, EVEX_V256, PD, REX_W, + sched.PD.YMM, IsCommutable>, EVEX_V256, TB, PD, REX_W, EVEX_CD8<64, CD8VF>; } } @@ -5637,15 +5637,15 @@ multiclass avx512_fp_binop_ph<bits<8> opc, string OpcodeStr, SDPatternOperator O X86SchedWriteSizes sched, bit IsCommutable = 0> { let Predicates = [HasFP16] in { defm PHZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v32f16_info, - sched.PH.ZMM, IsCommutable>, EVEX_V512, T_MAP5PS, + sched.PH.ZMM, IsCommutable>, EVEX_V512, T_MAP5, PS, EVEX_CD8<16, CD8VF>; } let Predicates = [HasVLX, HasFP16] in { defm PHZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f16x_info, - sched.PH.XMM, IsCommutable>, EVEX_V128, T_MAP5PS, + sched.PH.XMM, IsCommutable>, EVEX_V128, T_MAP5, PS, EVEX_CD8<16, CD8VF>; defm PHZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f16x_info, - sched.PH.YMM, IsCommutable>, EVEX_V256, T_MAP5PS, + sched.PH.YMM, IsCommutable>, EVEX_V256, T_MAP5, PS, EVEX_CD8<16, CD8VF>; } } @@ -5656,14 +5656,14 @@ multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeR let Predicates = [HasFP16] in { defm PHZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PH.ZMM, v32f16_info>, - EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>; + EVEX_V512, T_MAP5, PS, EVEX_CD8<16, CD8VF>; } defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM, v16f32_info>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + EVEX_V512, TB, PS, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM, v8f64_info>, - EVEX_V512, PD, REX_W,EVEX_CD8<64, CD8VF>; + EVEX_V512, TB, PD, REX_W,EVEX_CD8<64, CD8VF>; } let Uses = [MXCSR] in @@ -5672,14 +5672,14 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd let Predicates = [HasFP16] in { defm PHZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PH.ZMM, v32f16_info>, - EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>; + EVEX_V512, 
T_MAP5, PS, EVEX_CD8<16, CD8VF>; } defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM, v16f32_info>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + EVEX_V512, TB, PS, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM, v8f64_info>, - EVEX_V512, PD, REX_W,EVEX_CD8<64, CD8VF>; + EVEX_V512, TB, PD, REX_W,EVEX_CD8<64, CD8VF>; } defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, fadd, HasAVX512, @@ -5770,43 +5770,43 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr let Predicates = [HasFP16] in { defm PHZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v32f16_info>, avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v32f16_info>, - EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>; + EVEX_V512, T_MAP6, PD, EVEX_CD8<16, CD8VF>; defm SHZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f16x_info>, avx512_fp_scalar_round<opcScaler, OpcodeStr#"sh", f16x_info, X86scalefsRnd, sched.Scl>, - EVEX, VVVV, T_MAP6PD, EVEX_CD8<16, CD8VT1>; + EVEX, VVVV, T_MAP6, PD, EVEX_CD8<16, CD8VT1>; } defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v16f32_info>, avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>, T8PD; + EVEX_V512, EVEX_CD8<32, CD8VF>, T8, PD; defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.ZMM, v8f64_info>, avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v8f64_info>, - EVEX_V512, REX_W, EVEX_CD8<64, CD8VF>, T8PD; + EVEX_V512, REX_W, EVEX_CD8<64, CD8VF>, T8, PD; defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>, avx512_fp_scalar_round<opcScaler, OpcodeStr#"ss", f32x_info, X86scalefsRnd, sched.Scl>, - EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, T8PD; + EVEX, VVVV, VEX_LIG, EVEX_CD8<32, CD8VT1>, T8, PD; defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>, avx512_fp_scalar_round<opcScaler, OpcodeStr#"sd", f64x_info, X86scalefsRnd, sched.Scl>, - EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, REX_W, T8PD; + EVEX, VVVV, VEX_LIG, EVEX_CD8<64, CD8VT1>, REX_W, T8, PD; // Define only if AVX512VL feature is present. 
let Predicates = [HasVLX] in { defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v4f32x_info>, - EVEX_V128, EVEX_CD8<32, CD8VF>, T8PD; + EVEX_V128, EVEX_CD8<32, CD8VF>, T8, PD; defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v8f32x_info>, - EVEX_V256, EVEX_CD8<32, CD8VF>, T8PD; + EVEX_V256, EVEX_CD8<32, CD8VF>, T8, PD; defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v2f64x_info>, - EVEX_V128, REX_W, EVEX_CD8<64, CD8VF>, T8PD; + EVEX_V128, REX_W, EVEX_CD8<64, CD8VF>, T8, PD; defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v4f64x_info>, - EVEX_V256, REX_W, EVEX_CD8<64, CD8VF>, T8PD; + EVEX_V256, REX_W, EVEX_CD8<64, CD8VF>, T8, PD; } let Predicates = [HasFP16, HasVLX] in { defm PHZ128 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.XMM, v8f16x_info>, - EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6PD; + EVEX_V128, EVEX_CD8<16, CD8VF>, T_MAP6, PD; defm PHZ256 : avx512_fp_scalef_p<opc, OpcodeStr, X86scalef, sched.YMM, v16f16x_info>, - EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6PD; + EVEX_V256, EVEX_CD8<16, CD8VF>, T_MAP6, PD; } } defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", @@ -5898,9 +5898,9 @@ multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string Opcode avx512_vptest_dq<opc_dq, OpcodeStr, sched>; defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", - SchedWriteVecLogic>, T8PD; + SchedWriteVecLogic>, T8, PD; defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", - SchedWriteVecLogic>, T8XS; + SchedWriteVecLogic>, T8, XS; //===----------------------------------------------------------------------===// // AVX-512 Shift instructions @@ -6374,14 +6374,14 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (Ctrl.VT Ctrl.RC:$src2)))>, - T8PD, EVEX, VVVV, Sched<[sched]>; + T8, PD, EVEX, VVVV, Sched<[sched]>; defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (Ctrl.VT (Ctrl.LdFrag addr:$src2))))>, - T8PD, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>, + T8, PD, EVEX, VVVV, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, @@ -6390,7 +6390,7 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, (_.VT (OpNode _.RC:$src1, (Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>, - T8PD, EVEX, VVVV, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + T8, PD, EVEX, VVVV, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6500,13 +6500,13 @@ multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, // No patterns for MOVLPS/MOVHPS as the Movlhps node should only be created in // SSE1. And MOVLPS pattern is even more complex. 
defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", null_frag, - v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; + v4f32x_info>, EVEX_CD8<32, CD8VT2>, TB, PS; defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl, - v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, REX_W; + v2f64x_info>, EVEX_CD8<64, CD8VT1>, TB, PD, REX_W; defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", null_frag, - v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; + v4f32x_info>, EVEX_CD8<32, CD8VT2>, TB, PS; defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movsd, - v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, REX_W; + v2f64x_info>, EVEX_CD8<64, CD8VT1>, TB, PD, REX_W; let Predicates = [HasAVX512] in { // VMOVHPD patterns @@ -6627,13 +6627,13 @@ multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDPatternOperator O SDNode MaskOpNode, SDNode OpNodeRnd> { defm PH : avx512_fma3p_213_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode, OpNodeRnd, SchedWriteFMA, - avx512vl_f16_info, HasFP16>, T_MAP6PD; + avx512vl_f16_info, HasFP16>, T_MAP6, PD; defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode, OpNodeRnd, SchedWriteFMA, - avx512vl_f32_info>, T8PD; + avx512vl_f32_info>, T8, PD; defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode, OpNodeRnd, SchedWriteFMA, - avx512vl_f64_info>, T8PD, REX_W; + avx512vl_f64_info>, T8, PD, REX_W; } defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", any_fma, @@ -6724,13 +6724,13 @@ multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDPatternOperator O SDNode MaskOpNode, SDNode OpNodeRnd > { defm PH : avx512_fma3p_231_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode, OpNodeRnd, SchedWriteFMA, - avx512vl_f16_info, HasFP16>, T_MAP6PD; + avx512vl_f16_info, HasFP16>, T_MAP6, PD; defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode, OpNodeRnd, SchedWriteFMA, - avx512vl_f32_info>, T8PD; + avx512vl_f32_info>, T8, PD; defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode, OpNodeRnd, SchedWriteFMA, - avx512vl_f64_info>, T8PD, REX_W; + avx512vl_f64_info>, T8, PD, REX_W; } defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", any_fma, @@ -6822,13 +6822,13 @@ multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDPatternOperator O SDNode MaskOpNode, SDNode OpNodeRnd > { defm PH : avx512_fma3p_132_common<opc, OpcodeStr#"ph", OpNode, MaskOpNode, OpNodeRnd, SchedWriteFMA, - avx512vl_f16_info, HasFP16>, T_MAP6PD; + avx512vl_f16_info, HasFP16>, T_MAP6, PD; defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode, OpNodeRnd, SchedWriteFMA, - avx512vl_f32_info>, T8PD; + avx512vl_f32_info>, T8, PD; defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode, OpNodeRnd, SchedWriteFMA, - avx512vl_f64_info>, T8PD, REX_W; + avx512vl_f64_info>, T8, PD, REX_W; } defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", any_fma, @@ -6929,15 +6929,15 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132, let Predicates = [HasAVX512] in { defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, OpNodeRnd, f32x_info, "SS">, - EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD; + EVEX_CD8<32, CD8VT1>, VEX_LIG, T8, PD; defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, OpNodeRnd, f64x_info, "SD">, - EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8PD; + EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8, PD; } let Predicates = [HasFP16] in { defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, OpNodeRnd, f16x_info, "SH">, - EVEX_CD8<16, 
CD8VT1>, VEX_LIG, T_MAP6PD; + EVEX_CD8<16, CD8VT1>, VEX_LIG, T_MAP6, PD; } } @@ -7189,13 +7189,13 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>, - T8PD, EVEX, VVVV, Sched<[sched]>; + T8, PD, EVEX, VVVV, Sched<[sched]>; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, - T8PD, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold, + T8, PD, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -7205,7 +7205,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1)>, - T8PD, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold, + T8, PD, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } } @@ -7307,18 +7307,18 @@ let Predicates = [HasAVX512] in { defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SS, GR32, v4f32x_info, i32mem, loadi32, "cvtsi2ss", "l">, - XS, EVEX_CD8<32, CD8VT1>; + TB, XS, EVEX_CD8<32, CD8VT1>; defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SS, GR64, v4f32x_info, i64mem, loadi64, "cvtsi2ss", "q">, - XS, REX_W, EVEX_CD8<64, CD8VT1>; + TB, XS, REX_W, EVEX_CD8<64, CD8VT1>; defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, null_frag, WriteCvtI2SD, GR32, v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l", [], 0>, - XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; + TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SD, GR64, v2f64x_info, i64mem, loadi64, "cvtsi2sd", "q">, - XD, REX_W, EVEX_CD8<64, CD8VT1>; + TB, XD, REX_W, EVEX_CD8<64, CD8VT1>; def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; @@ -7346,18 +7346,18 @@ def : Pat<(f64 (any_sint_to_fp GR64:$src)), defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SS, GR32, v4f32x_info, i32mem, loadi32, - "cvtusi2ss", "l">, XS, EVEX_CD8<32, CD8VT1>; + "cvtusi2ss", "l">, TB, XS, EVEX_CD8<32, CD8VT1>; defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SS, GR64, v4f32x_info, i64mem, loadi64, "cvtusi2ss", "q">, - XS, REX_W, EVEX_CD8<64, CD8VT1>; + TB, XS, REX_W, EVEX_CD8<64, CD8VT1>; defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, null_frag, WriteCvtI2SD, GR32, v2f64x_info, i32mem, loadi32, "cvtusi2sd", "l", [], 0>, - XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; + TB, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SD, GR64, v2f64x_info, i64mem, loadi64, "cvtusi2sd", "q">, - XD, REX_W, EVEX_CD8<64, CD8VT1>; + TB, XD, REX_W, EVEX_CD8<64, CD8VT1>; def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTUSI2SSZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; @@ -7422,28 +7422,28 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT, // Convert float/double to signed/unsigned int 32/64 defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,X86cvts2si, X86cvts2siRnd, WriteCvtSS2I, "cvtss2si", "{l}">, - XS, EVEX_CD8<32, CD8VT1>; 
+ TB, XS, EVEX_CD8<32, CD8VT1>; defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info, X86cvts2si, X86cvts2siRnd, WriteCvtSS2I, "cvtss2si", "{q}">, - XS, REX_W, EVEX_CD8<32, CD8VT1>; + TB, XS, REX_W, EVEX_CD8<32, CD8VT1>; defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{l}">, - XS, EVEX_CD8<32, CD8VT1>; + TB, XS, EVEX_CD8<32, CD8VT1>; defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSS2I, "cvtss2usi", "{q}">, - XS, REX_W, EVEX_CD8<32, CD8VT1>; + TB, XS, REX_W, EVEX_CD8<32, CD8VT1>; defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, X86cvts2si, X86cvts2siRnd, WriteCvtSD2I, "cvtsd2si", "{l}">, - XD, EVEX_CD8<64, CD8VT1>; + TB, XD, EVEX_CD8<64, CD8VT1>; defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info, X86cvts2si, X86cvts2siRnd, WriteCvtSD2I, "cvtsd2si", "{q}">, - XD, REX_W, EVEX_CD8<64, CD8VT1>; + TB, XD, REX_W, EVEX_CD8<64, CD8VT1>; defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{l}">, - XD, EVEX_CD8<64, CD8VT1>; + TB, XD, EVEX_CD8<64, CD8VT1>; defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{q}">, - XD, REX_W, EVEX_CD8<64, CD8VT1>; + TB, XD, REX_W, EVEX_CD8<64, CD8VT1>; multiclass avx512_cvt_s<bits<8> opc, string asm, X86VectorVTInfo SrcVT, X86VectorVTInfo DstVT, SDNode OpNode, @@ -7463,13 +7463,13 @@ multiclass avx512_cvt_s<bits<8> opc, string asm, X86VectorVTInfo SrcVT, } defm VCVTSS2SIZ: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i32x_info, - lrint, WriteCvtSS2I>, XS, EVEX_CD8<32, CD8VT1>; + lrint, WriteCvtSS2I>, TB, XS, EVEX_CD8<32, CD8VT1>; defm VCVTSS2SI64Z: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i64x_info, - llrint, WriteCvtSS2I>, REX_W, XS, EVEX_CD8<32, CD8VT1>; + llrint, WriteCvtSS2I>, REX_W, TB, XS, EVEX_CD8<32, CD8VT1>; defm VCVTSD2SIZ: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i32x_info, - lrint, WriteCvtSD2I>, XD, EVEX_CD8<64, CD8VT1>; + lrint, WriteCvtSD2I>, TB, XD, EVEX_CD8<64, CD8VT1>; defm VCVTSD2SI64Z: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i64x_info, - llrint, WriteCvtSD2I>, REX_W, XD, EVEX_CD8<64, CD8VT1>; + llrint, WriteCvtSD2I>, REX_W, TB, XD, EVEX_CD8<64, CD8VT1>; let Predicates = [HasAVX512] in { def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64Zrr FR32:$src)>; @@ -7609,29 +7609,29 @@ let Predicates = [prd], ExeDomain = _SrcRC.ExeDomain in { defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info, any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, - "{l}">, XS, EVEX_CD8<32, CD8VT1>; + "{l}">, TB, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info, any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, - "{q}">, REX_W, XS, EVEX_CD8<32, CD8VT1>; + "{q}">, REX_W, TB, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info, any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, - "{l}">, XD, EVEX_CD8<64, CD8VT1>; + "{l}">, TB, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info, any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, - "{q}">, REX_W, XD, EVEX_CD8<64, CD8VT1>; + "{q}">, REX_W, TB, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info, any_fp_to_uint, 
X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, - "{l}">, XS, EVEX_CD8<32, CD8VT1>; + "{l}">, TB, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info, any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, - "{q}">, XS,REX_W, EVEX_CD8<32, CD8VT1>; + "{q}">, TB, XS,REX_W, EVEX_CD8<32, CD8VT1>; defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info, any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, - "{l}">, XD, EVEX_CD8<64, CD8VT1>; + "{l}">, TB, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info, any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, - "{q}">, XD, REX_W, EVEX_CD8<64, CD8VT1>; + "{q}">, TB, XD, REX_W, EVEX_CD8<64, CD8VT1>; //===----------------------------------------------------------------------===// // AVX-512 Convert form float to double and back @@ -7719,22 +7719,22 @@ multiclass avx512_cvt_fp_scalar_extend<bits<8> opc, string OpcodeStr, } defm VCVTSD2SS : avx512_cvt_fp_scalar_trunc<0x5A, "vcvtsd2ss", X86frounds, X86froundsRnd, WriteCvtSD2SS, f64x_info, - f32x_info>, XD, REX_W; + f32x_info>, TB, XD, REX_W; defm VCVTSS2SD : avx512_cvt_fp_scalar_extend<0x5A, "vcvtss2sd", X86fpexts, X86fpextsSAE, WriteCvtSS2SD, f32x_info, - f64x_info>, XS; + f64x_info>, TB, XS; defm VCVTSD2SH : avx512_cvt_fp_scalar_trunc<0x5A, "vcvtsd2sh", X86frounds, X86froundsRnd, WriteCvtSD2SS, f64x_info, - f16x_info, HasFP16>, T_MAP5XD, REX_W; + f16x_info, HasFP16>, T_MAP5, XD, REX_W; defm VCVTSH2SD : avx512_cvt_fp_scalar_extend<0x5A, "vcvtsh2sd", X86fpexts, X86fpextsSAE, WriteCvtSS2SD, f16x_info, - f64x_info, HasFP16>, T_MAP5XS; + f64x_info, HasFP16>, T_MAP5, XS; defm VCVTSS2SH : avx512_cvt_fp_scalar_trunc<0x1D, "vcvtss2sh", X86frounds, X86froundsRnd, WriteCvtSD2SS, f32x_info, - f16x_info, HasFP16>, T_MAP5PS; + f16x_info, HasFP16>, T_MAP5, PS; defm VCVTSH2SS : avx512_cvt_fp_scalar_extend<0x13, "vcvtsh2ss", X86fpexts, X86fpextsSAE, WriteCvtSS2SD, f16x_info, - f32x_info, HasFP16>, T_MAP6PS; + f32x_info, HasFP16>, T_MAP6, PS; def : Pat<(f64 (any_fpextend FR32X:$src)), (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>, @@ -7996,10 +7996,10 @@ multiclass avx512_cvt_trunc<bits<8> opc, string OpcodeStr, defm VCVTPD2PS : avx512_cvt_trunc<0x5A, "vcvtpd2ps", avx512vl_f32_info, avx512vl_f64_info, SchedWriteCvtPD2PS>, - REX_W, PD, EVEX_CD8<64, CD8VF>; + REX_W, TB, PD, EVEX_CD8<64, CD8VF>; defm VCVTPS2PD : avx512_cvt_extend<0x5A, "vcvtps2pd", avx512vl_f64_info, avx512vl_f32_info, SchedWriteCvtPS2PD>, - PS, EVEX_CD8<32, CD8VH>; + TB, PS, EVEX_CD8<32, CD8VH>; // Extend Half to Double multiclass avx512_cvtph2pd<bits<8> opc, string OpcodeStr, @@ -8108,14 +8108,14 @@ multiclass avx512_cvtpd2ph<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sc defm VCVTPS2PHX : avx512_cvt_trunc<0x1D, "vcvtps2phx", avx512vl_f16_info, avx512vl_f32_info, SchedWriteCvtPD2PS, - HasFP16>, T_MAP5PD, EVEX_CD8<32, CD8VF>; + HasFP16>, T_MAP5, PD, EVEX_CD8<32, CD8VF>; defm VCVTPH2PSX : avx512_cvt_extend<0x13, "vcvtph2psx", avx512vl_f32_info, avx512vl_f16_info, SchedWriteCvtPS2PD, - HasFP16>, T_MAP6PD, EVEX_CD8<16, CD8VH>; + HasFP16>, T_MAP6, PD, EVEX_CD8<16, CD8VH>; defm VCVTPD2PH : avx512_cvtpd2ph<0x5A, "vcvtpd2ph", SchedWriteCvtPD2PS>, - REX_W, T_MAP5PD, EVEX_CD8<64, CD8VF>; + REX_W, T_MAP5, PD, EVEX_CD8<64, CD8VF>; defm VCVTPH2PD : avx512_cvtph2pd<0x5A, "vcvtph2pd", SchedWriteCvtPS2PD>, - T_MAP5PS, EVEX_CD8<16, CD8VQ>; + T_MAP5, PS, EVEX_CD8<16, CD8VQ>; let Predicates = [HasFP16, 
HasVLX] in { // Special patterns to allow use of X86vmfpround for masking. Instruction @@ -8596,120 +8596,120 @@ multiclass avx512_cvtqq2ps_dq2ph<bits<8> opc, string OpcodeStr, SDPatternOperato defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, sint_to_fp, X86any_VSintToFP, X86VSintToFP, - SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>; + SchedWriteCvtDQ2PD>, TB, XS, EVEX_CD8<32, CD8VH>; defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp, sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>, - PS, EVEX_CD8<32, CD8VF>; + TB, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86any_cvttp2si, X86cvttp2si, X86cvttp2siSAE, - SchedWriteCvtPS2DQ>, XS, EVEX_CD8<32, CD8VF>; + SchedWriteCvtPS2DQ>, TB, XS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86any_cvttp2si, X86cvttp2si, X86cvttp2siSAE, SchedWriteCvtPD2DQ>, - PD, REX_W, EVEX_CD8<64, CD8VF>; + TB, PD, REX_W, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86any_cvttp2ui, X86cvttp2ui, X86cvttp2uiSAE, - SchedWriteCvtPS2DQ>, PS, EVEX_CD8<32, CD8VF>; + SchedWriteCvtPS2DQ>, TB, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86any_cvttp2ui, X86cvttp2ui, X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, - PS, REX_W, EVEX_CD8<64, CD8VF>; + TB, PS, REX_W, EVEX_CD8<64, CD8VF>; defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", any_uint_to_fp, uint_to_fp, X86any_VUintToFP, X86VUintToFP, - SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>; + SchedWriteCvtDQ2PD>, TB, XS, EVEX_CD8<32, CD8VH>; defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", any_uint_to_fp, uint_to_fp, X86VUintToFpRnd, - SchedWriteCvtDQ2PS>, XD, EVEX_CD8<32, CD8VF>; + SchedWriteCvtDQ2PS>, TB, XD, EVEX_CD8<32, CD8VF>; defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int, X86cvtp2Int, - X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD, + X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, TB, PD, EVEX_CD8<32, CD8VF>; defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int, X86cvtp2Int, - X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, XD, + X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, TB, XD, REX_W, EVEX_CD8<64, CD8VF>; defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt, X86cvtp2UInt, X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, - PS, EVEX_CD8<32, CD8VF>; + TB, PS, EVEX_CD8<32, CD8VF>; defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt, X86cvtp2UInt, X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, REX_W, - PS, EVEX_CD8<64, CD8VF>; + TB, PS, EVEX_CD8<64, CD8VF>; defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int, X86cvtp2Int, X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, REX_W, - PD, EVEX_CD8<64, CD8VF>; + TB, PD, EVEX_CD8<64, CD8VF>; defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int, X86cvtp2Int, - X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD, + X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, TB, PD, EVEX_CD8<32, CD8VH>; defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt, X86cvtp2UInt, X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, REX_W, - PD, EVEX_CD8<64, CD8VF>; + TB, PD, EVEX_CD8<64, CD8VF>; defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, X86cvtp2UInt, - X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PD, + X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, TB, PD, EVEX_CD8<32, CD8VH>; defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86any_cvttp2si, X86cvttp2si, X86cvttp2siSAE, SchedWriteCvtPD2DQ>, REX_W, - PD, EVEX_CD8<64, CD8VF>; + TB, PD, EVEX_CD8<64, CD8VF>; defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86any_cvttp2si, 
X86cvttp2si, X86cvttp2siSAE, - SchedWriteCvtPS2DQ>, PD, + SchedWriteCvtPS2DQ>, TB, PD, EVEX_CD8<32, CD8VH>; defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86any_cvttp2ui, X86cvttp2ui, X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, REX_W, - PD, EVEX_CD8<64, CD8VF>; + TB, PD, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86any_cvttp2ui, X86cvttp2ui, X86cvttp2uiSAE, - SchedWriteCvtPS2DQ>, PD, + SchedWriteCvtPS2DQ>, TB, PD, EVEX_CD8<32, CD8VH>; defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", any_sint_to_fp, sint_to_fp, X86VSintToFpRnd, - SchedWriteCvtDQ2PD>, REX_W, XS, EVEX_CD8<64, CD8VF>; + SchedWriteCvtDQ2PD>, REX_W, TB, XS, EVEX_CD8<64, CD8VF>; defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", any_uint_to_fp, uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PD>, - REX_W, XS, EVEX_CD8<64, CD8VF>; + REX_W, TB, XS, EVEX_CD8<64, CD8VF>; defm VCVTDQ2PH : avx512_cvtqq2ps_dq2ph<0x5B, "vcvtdq2ph", any_sint_to_fp, sint_to_fp, X86any_VSintToFP, X86VMSintToFP, X86VSintToFpRnd, avx512vl_f16_info, avx512vl_i32_info, SchedWriteCvtDQ2PS, HasFP16>, - T_MAP5PS, EVEX_CD8<32, CD8VF>; + T_MAP5, PS, EVEX_CD8<32, CD8VF>; defm VCVTUDQ2PH : avx512_cvtqq2ps_dq2ph<0x7A, "vcvtudq2ph", any_uint_to_fp, uint_to_fp, X86any_VUintToFP, X86VMUintToFP, X86VUintToFpRnd, avx512vl_f16_info, avx512vl_i32_info, - SchedWriteCvtDQ2PS, HasFP16>, T_MAP5XD, + SchedWriteCvtDQ2PS, HasFP16>, T_MAP5, XD, EVEX_CD8<32, CD8VF>; defm VCVTQQ2PS : avx512_cvtqq2ps_dq2ph<0x5B, "vcvtqq2ps", any_sint_to_fp, sint_to_fp, X86any_VSintToFP, X86VMSintToFP, X86VSintToFpRnd, avx512vl_f32_info, avx512vl_i64_info, - SchedWriteCvtDQ2PS>, REX_W, PS, + SchedWriteCvtDQ2PS>, REX_W, TB, PS, EVEX_CD8<64, CD8VF>; defm VCVTUQQ2PS : avx512_cvtqq2ps_dq2ph<0x7A, "vcvtuqq2ps", any_uint_to_fp, uint_to_fp, X86any_VUintToFP, X86VMUintToFP, X86VUintToFpRnd, avx512vl_f32_info, avx512vl_i64_info, - SchedWriteCvtDQ2PS>, REX_W, XD, + SchedWriteCvtDQ2PS>, REX_W, TB, XD, EVEX_CD8<64, CD8VF>; let Predicates = [HasVLX] in { @@ -8912,12 +8912,12 @@ multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, (ins _src.RC:$src), "vcvtph2ps", "$src", "$src", (X86any_cvtph2ps (_src.VT _src.RC:$src)), (X86cvtph2ps (_src.VT _src.RC:$src))>, - T8PD, Sched<[sched]>; + T8, PD, Sched<[sched]>; defm rm : AVX512_maskable_split<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), "vcvtph2ps", "$src", "$src", (X86any_cvtph2ps (_src.VT ld_dag)), (X86cvtph2ps (_src.VT ld_dag))>, - T8PD, Sched<[sched.Folded]>; + T8, PD, Sched<[sched.Folded]>; } multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src, @@ -8927,7 +8927,7 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src, (ins _src.RC:$src), "vcvtph2ps", "{sae}, $src", "$src, {sae}", (X86cvtph2psSAE (_src.VT _src.RC:$src))>, - T8PD, EVEX_B, Sched<[sched]>; + T8, PD, EVEX_B, Sched<[sched]>; } let Predicates = [HasAVX512] in @@ -9068,55 +9068,55 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { let Defs = [EFLAGS], Predicates = [HasAVX512] in { defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86any_fcmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG, + "ucomiss", SSEPackedSingle>, TB, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86any_fcmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, PD, EVEX, + "ucomisd", SSEPackedDouble>, TB, PD, EVEX, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>; defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, X86strict_fcmps, f32, f32mem, loadf32, - "comiss", 
SSEPackedSingle>, PS, EVEX, VEX_LIG, + "comiss", SSEPackedSingle>, TB, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, X86strict_fcmps, f64, f64mem, loadf64, - "comisd", SSEPackedDouble>, PD, EVEX, + "comisd", SSEPackedDouble>, TB, PD, EVEX, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>; let isCodeGenOnly = 1 in { defm VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG, + sse_load_f32, "ucomiss", SSEPackedSingle>, TB, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd", SSEPackedDouble>, PD, EVEX, + sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD, EVEX, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>; defm VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", SSEPackedSingle>, PS, EVEX, VEX_LIG, + sse_load_f32, "comiss", SSEPackedSingle>, TB, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem, - sse_load_f64, "comisd", SSEPackedDouble>, PD, EVEX, + sse_load_f64, "comisd", SSEPackedDouble>, TB, PD, EVEX, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>; } } let Defs = [EFLAGS], Predicates = [HasFP16] in { defm VUCOMISHZ : avx512_ord_cmp_sae<0x2E, v8f16x_info, "vucomish", - SSEPackedSingle>, AVX512PSIi8Base, T_MAP5PS, + SSEPackedSingle>, AVX512PSIi8Base, T_MAP5, EVEX_CD8<16, CD8VT1>; defm VCOMISHZ : avx512_ord_cmp_sae<0x2F, v8f16x_info, "vcomish", - SSEPackedSingle>, AVX512PSIi8Base, T_MAP5PS, + SSEPackedSingle>, AVX512PSIi8Base, T_MAP5, EVEX_CD8<16, CD8VT1>; defm VUCOMISHZ : sse12_ord_cmp<0x2E, FR16X, X86any_fcmp, f16, f16mem, loadf16, - "ucomish", SSEPackedSingle>, T_MAP5PS, EVEX, + "ucomish", SSEPackedSingle>, T_MAP5, PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; defm VCOMISHZ : sse12_ord_cmp<0x2F, FR16X, X86strict_fcmps, f16, f16mem, loadf16, - "comish", SSEPackedSingle>, T_MAP5PS, EVEX, + "comish", SSEPackedSingle>, T_MAP5, PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; let isCodeGenOnly = 1 in { defm VUCOMISHZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v8f16, shmem, sse_load_f16, "ucomish", SSEPackedSingle>, - T_MAP5PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; + T_MAP5, PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; defm VCOMISHZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v8f16, shmem, sse_load_f16, "comish", SSEPackedSingle>, - T_MAP5PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; + T_MAP5, PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; } } @@ -9141,23 +9141,23 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, defm VRCPSHZ : avx512_fp14_s<0x4D, "vrcpsh", X86rcp14s, SchedWriteFRcp.Scl, f16x_info, HasFP16>, EVEX_CD8<16, CD8VT1>, - T_MAP6PD; + T_MAP6, PD; defm VRSQRTSHZ : avx512_fp14_s<0x4F, "vrsqrtsh", X86rsqrt14s, SchedWriteFRsqrt.Scl, f16x_info, HasFP16>, - EVEX_CD8<16, CD8VT1>, T_MAP6PD; + EVEX_CD8<16, CD8VT1>, T_MAP6, PD; let Uses = [MXCSR] in { defm VRCP14SSZ : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SchedWriteFRcp.Scl, f32x_info>, EVEX_CD8<32, CD8VT1>, - T8PD; + T8, PD; defm VRCP14SDZ : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, SchedWriteFRcp.Scl, f64x_info>, REX_W, EVEX_CD8<64, CD8VT1>, - T8PD; + T8, PD; defm VRSQRT14SSZ : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, SchedWriteFRsqrt.Scl, f32x_info>, - EVEX_CD8<32, CD8VT1>, T8PD; + EVEX_CD8<32, CD8VT1>, T8, PD; defm VRSQRT14SDZ : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, SchedWriteFRsqrt.Scl, f64x_info>, REX_W, - EVEX_CD8<64, CD8VT1>, T8PD; + EVEX_CD8<64, 
CD8VT1>, T8, PD; } /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd @@ -9166,19 +9166,19 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, let ExeDomain = _.ExeDomain in { defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", - (_.VT (OpNode _.RC:$src))>, EVEX, T8PD, + (_.VT (OpNode _.RC:$src))>, EVEX, T8, PD, Sched<[sched]>; defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src), OpcodeStr, "$src", "$src", (OpNode (_.VT - (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD, + (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8, PD, Sched<[sched.Folded, sched.ReadAfterFold]>; defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr, (OpNode (_.VT (_.BroadcastLdFrag addr:$src)))>, - EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + EVEX, T8, PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -9192,7 +9192,7 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode, } let Predicates = [HasFP16] in defm PHZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"), OpNode, sched.ZMM, - v32f16_info>, EVEX_V512, T_MAP6PD, EVEX_CD8<16, CD8VF>; + v32f16_info>, EVEX_V512, T_MAP6, EVEX_CD8<16, CD8VF>; // Define only if AVX512VL feature is present. let Predicates = [HasVLX], Uses = [MXCSR] in { @@ -9212,10 +9212,10 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasFP16, HasVLX] in { defm PHZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"), OpNode, sched.XMM, v8f16x_info>, - EVEX_V128, T_MAP6PD, EVEX_CD8<16, CD8VF>; + EVEX_V128, T_MAP6, EVEX_CD8<16, CD8VF>; defm PHZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ph"), OpNode, sched.YMM, v16f16x_info>, - EVEX_V256, T_MAP6PD, EVEX_CD8<16, CD8VF>; + EVEX_V256, T_MAP6, EVEX_CD8<16, CD8VF>; } } @@ -9250,16 +9250,16 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { defm SSZ : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, OpNodeSAE, - sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG, T8PD, EVEX, VVVV; + sched>, EVEX_CD8<32, CD8VT1>, VEX_LIG, T8, PD, EVEX, VVVV; defm SDZ : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, OpNodeSAE, - sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8PD, EVEX, VVVV; + sched>, EVEX_CD8<64, CD8VT1>, VEX_LIG, REX_W, T8, PD, EVEX, VVVV; } multiclass avx512_vgetexpsh<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { let Predicates = [HasFP16] in defm SHZ : avx512_fp28_s<opc, OpcodeStr#"sh", f16x_info, OpNode, OpNodeSAE, sched>, - EVEX_CD8<16, CD8VT1>, T_MAP6PD, EVEX, VVVV; + EVEX_CD8<16, CD8VT1>, T_MAP6, PD, EVEX, VVVV; } let Predicates = [HasERI] in { @@ -9311,10 +9311,10 @@ multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched> { defm PSZ : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, sched.ZMM>, avx512_fp28_p_sae<opc, OpcodeStr#"ps", v16f32_info, OpNodeSAE, sched.ZMM>, - T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; + T8, PD, EVEX_V512, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, sched.ZMM>, avx512_fp28_p_sae<opc, OpcodeStr#"pd", v8f64_info, OpNodeSAE, sched.ZMM>, - T8PD, EVEX_V512, REX_W, EVEX_CD8<64, CD8VF>; + T8, PD, EVEX_V512, REX_W, EVEX_CD8<64, 
CD8VF>; } multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr, @@ -9323,16 +9323,16 @@ multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX] in { defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, sched.XMM>, - EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>; + EVEX_V128, T8, PD, EVEX_CD8<32, CD8VF>; defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, sched.YMM>, - EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>; + EVEX_V256, T8, PD, EVEX_CD8<32, CD8VF>; defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, sched.XMM>, - EVEX_V128, REX_W, T8PD, EVEX_CD8<64, CD8VF>; + EVEX_V128, REX_W, T8, PD, EVEX_CD8<64, CD8VF>; defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, sched.YMM>, - EVEX_V256, REX_W, T8PD, EVEX_CD8<64, CD8VF>; + EVEX_V256, REX_W, T8, PD, EVEX_CD8<64, CD8VF>; } } @@ -9341,12 +9341,12 @@ multiclass avx512_vgetexp_fp16<bits<8> opc, string OpcodeStr, SDNode OpNode, let Predicates = [HasFP16] in defm PHZ : avx512_fp28_p<opc, OpcodeStr#"ph", v32f16_info, OpNode, sched.ZMM>, avx512_fp28_p_sae<opc, OpcodeStr#"ph", v32f16_info, OpNodeSAE, sched.ZMM>, - T_MAP6PD, EVEX_V512, EVEX_CD8<16, CD8VF>; + T_MAP6, PD, EVEX_V512, EVEX_CD8<16, CD8VF>; let Predicates = [HasFP16, HasVLX] in { defm PHZ128 : avx512_fp28_p<opc, OpcodeStr#"ph", v8f16x_info, OpNode, sched.XMM>, - EVEX_V128, T_MAP6PD, EVEX_CD8<16, CD8VF>; + EVEX_V128, T_MAP6, PD, EVEX_CD8<16, CD8VF>; defm PHZ256 : avx512_fp28_p<opc, OpcodeStr#"ph", v16f16x_info, OpNode, sched.YMM>, - EVEX_V256, T_MAP6PD, EVEX_CD8<16, CD8VF>; + EVEX_V256, T_MAP6, PD, EVEX_CD8<16, CD8VF>; } } let Predicates = [HasERI] in { @@ -9401,35 +9401,35 @@ multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, let Predicates = [HasFP16] in defm PHZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"), sched.PH.ZMM, v32f16_info>, - EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>; + EVEX_V512, T_MAP5, PS, EVEX_CD8<16, CD8VF>; let Predicates = [HasFP16, HasVLX] in { defm PHZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"), sched.PH.XMM, v8f16x_info>, - EVEX_V128, T_MAP5PS, EVEX_CD8<16, CD8VF>; + EVEX_V128, T_MAP5, PS, EVEX_CD8<16, CD8VF>; defm PHZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"), sched.PH.YMM, v16f16x_info>, - EVEX_V256, T_MAP5PS, EVEX_CD8<16, CD8VF>; + EVEX_V256, T_MAP5, PS, EVEX_CD8<16, CD8VF>; } defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), sched.PS.ZMM, v16f32_info>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + EVEX_V512, TB, PS, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), sched.PD.ZMM, v8f64_info>, - EVEX_V512, REX_W, PD, EVEX_CD8<64, CD8VF>; + EVEX_V512, REX_W, TB, PD, EVEX_CD8<64, CD8VF>; // Define only if AVX512VL feature is present. 
let Predicates = [HasVLX] in { defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), sched.PS.XMM, v4f32x_info>, - EVEX_V128, PS, EVEX_CD8<32, CD8VF>; + EVEX_V128, TB, PS, EVEX_CD8<32, CD8VF>; defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), sched.PS.YMM, v8f32x_info>, - EVEX_V256, PS, EVEX_CD8<32, CD8VF>; + EVEX_V256, TB, PS, EVEX_CD8<32, CD8VF>; defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), sched.PD.XMM, v2f64x_info>, - EVEX_V128, REX_W, PD, EVEX_CD8<64, CD8VF>; + EVEX_V128, REX_W, TB, PD, EVEX_CD8<64, CD8VF>; defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), sched.PD.YMM, v4f64x_info>, - EVEX_V256, REX_W, PD, EVEX_CD8<64, CD8VF>; + EVEX_V256, REX_W, TB, PD, EVEX_CD8<64, CD8VF>; } } @@ -9439,13 +9439,13 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr, let Predicates = [HasFP16] in defm PHZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ph"), sched.PH.ZMM, v32f16_info>, - EVEX_V512, T_MAP5PS, EVEX_CD8<16, CD8VF>; + EVEX_V512, T_MAP5, PS, EVEX_CD8<16, CD8VF>; defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), sched.PS.ZMM, v16f32_info>, - EVEX_V512, PS, EVEX_CD8<32, CD8VF>; + EVEX_V512, TB, PS, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), sched.PD.ZMM, v8f64_info>, - EVEX_V512, REX_W, PD, EVEX_CD8<64, CD8VF>; + EVEX_V512, REX_W, TB, PD, EVEX_CD8<64, CD8VF>; } multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWrite sched, @@ -9501,11 +9501,11 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr, X86SchedWriteSizes sched> { defm SHZ : avx512_sqrt_scalar<opc, OpcodeStr#"sh", sched.PH.Scl, f16x_info, NAME#"SH", HasFP16>, - EVEX_CD8<16, CD8VT1>, EVEX, VVVV, T_MAP5XS; + EVEX_CD8<16, CD8VT1>, EVEX, VVVV, T_MAP5, XS; defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", sched.PS.Scl, f32x_info, NAME#"SS">, - EVEX_CD8<32, CD8VT1>, EVEX, VVVV, XS; + EVEX_CD8<32, CD8VT1>, EVEX, VVVV, TB, XS; defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", sched.PD.Scl, f64x_info, NAME#"SD">, - EVEX_CD8<64, CD8VT1>, EVEX, VVVV, XD, REX_W; + EVEX_CD8<64, CD8VT1>, EVEX, VVVV, TB, XD, REX_W; } defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", SchedWriteFSqrtSizes>, @@ -9923,16 +9923,16 @@ multiclass avx512_pmovx_bw<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX, HasBWI] in { defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v8i16x_info, v16i8x_info, i64mem, LdFrag, InVecNode>, - EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128, WIG; + EVEX_CD8<8, CD8VH>, T8, PD, EVEX_V128, WIG; defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v16i16x_info, v16i8x_info, i128mem, LdFrag, OpNode>, - EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256, WIG; + EVEX_CD8<8, CD8VH>, T8, PD, EVEX_V256, WIG; } let Predicates = [HasBWI] in { defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v32i16_info, v32i8x_info, i256mem, LdFrag, OpNode>, - EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512, WIG; + EVEX_CD8<8, CD8VH>, T8, PD, EVEX_V512, WIG; } } @@ -9943,16 +9943,16 @@ multiclass avx512_pmovx_bd<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v4i32x_info, v16i8x_info, i32mem, LdFrag, InVecNode>, - EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, WIG; + EVEX_CD8<8, CD8VQ>, T8, PD, EVEX_V128, WIG; defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v8i32x_info, v16i8x_info, i64mem, LdFrag, 
InVecNode>, - EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, WIG; + EVEX_CD8<8, CD8VQ>, T8, PD, EVEX_V256, WIG; } let Predicates = [HasAVX512] in { defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v16i32_info, v16i8x_info, i128mem, LdFrag, OpNode>, - EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512, WIG; + EVEX_CD8<8, CD8VQ>, T8, PD, EVEX_V512, WIG; } } @@ -9963,16 +9963,16 @@ multiclass avx512_pmovx_bq<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v2i64x_info, v16i8x_info, i16mem, LdFrag, InVecNode>, - EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, WIG; + EVEX_CD8<8, CD8VO>, T8, PD, EVEX_V128, WIG; defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v4i64x_info, v16i8x_info, i32mem, LdFrag, InVecNode>, - EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, WIG; + EVEX_CD8<8, CD8VO>, T8, PD, EVEX_V256, WIG; } let Predicates = [HasAVX512] in { defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v8i64_info, v16i8x_info, i64mem, LdFrag, InVecNode>, - EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, WIG; + EVEX_CD8<8, CD8VO>, T8, PD, EVEX_V512, WIG; } } @@ -9983,16 +9983,16 @@ multiclass avx512_pmovx_wd<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v4i32x_info, v8i16x_info, i64mem, LdFrag, InVecNode>, - EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128, WIG; + EVEX_CD8<16, CD8VH>, T8, PD, EVEX_V128, WIG; defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v8i32x_info, v8i16x_info, i128mem, LdFrag, OpNode>, - EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256, WIG; + EVEX_CD8<16, CD8VH>, T8, PD, EVEX_V256, WIG; } let Predicates = [HasAVX512] in { defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v16i32_info, v16i16x_info, i256mem, LdFrag, OpNode>, - EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512, WIG; + EVEX_CD8<16, CD8VH>, T8, PD, EVEX_V512, WIG; } } @@ -10003,16 +10003,16 @@ multiclass avx512_pmovx_wq<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v2i64x_info, v8i16x_info, i32mem, LdFrag, InVecNode>, - EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, WIG; + EVEX_CD8<16, CD8VQ>, T8, PD, EVEX_V128, WIG; defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v4i64x_info, v8i16x_info, i64mem, LdFrag, InVecNode>, - EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, WIG; + EVEX_CD8<16, CD8VQ>, T8, PD, EVEX_V256, WIG; } let Predicates = [HasAVX512] in { defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v8i64_info, v8i16x_info, i128mem, LdFrag, OpNode>, - EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512, WIG; + EVEX_CD8<16, CD8VQ>, T8, PD, EVEX_V512, WIG; } } @@ -10024,16 +10024,16 @@ multiclass avx512_pmovx_dq<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX, HasAVX512] in { defm Z128: avx512_pmovx_common<opc, OpcodeStr, sched.XMM, v2i64x_info, v4i32x_info, i64mem, LdFrag, InVecNode>, - EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128; + EVEX_CD8<32, CD8VH>, T8, PD, EVEX_V128; defm Z256: avx512_pmovx_common<opc, OpcodeStr, sched.YMM, v4i64x_info, v4i32x_info, i128mem, LdFrag, OpNode>, - EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256; + EVEX_CD8<32, CD8VH>, T8, PD, EVEX_V256; } let Predicates = [HasAVX512] in { defm Z : avx512_pmovx_common<opc, OpcodeStr, sched.ZMM, v8i64_info, v8i32x_info, i256mem, LdFrag, OpNode>, - EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512; + EVEX_CD8<32, CD8VH>, T8, PD, EVEX_V512; } } @@ -11258,7 +11258,7 @@ defm : avx512_unary_lowering<"VPOPCNTD", ctpop, avx512vl_i32_info, HasVPOPCNTDQ> multiclass avx512_replicate<bits<8> opc, string 
OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched> { defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, sched, - avx512vl_f32_info, HasAVX512>, XS; + avx512vl_f32_info, HasAVX512>, TB, XS; } defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup, @@ -11301,7 +11301,7 @@ multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, multiclass avx512_movddup<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> { defm NAME: avx512_movddup_common<opc, OpcodeStr, sched, - avx512vl_f64_info>, XD, REX_W; + avx512vl_f64_info>, TB, XD, REX_W; } defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", SchedWriteFShuffle>; @@ -11369,9 +11369,9 @@ multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> { OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrb (_.VT _.RC:$src1), timm:$src2))]>, - EVEX, TAPD, Sched<[WriteVecExtract]>; + EVEX, TA, PD, Sched<[WriteVecExtract]>; - defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD; + defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TA, PD; } } @@ -11382,15 +11382,15 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> { OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (_.VT _.RC:$src1), timm:$src2))]>, - EVEX, PD, Sched<[WriteVecExtract]>; + EVEX, TB, PD, Sched<[WriteVecExtract]>; let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), (ins _.RC:$src1, u8imm:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - EVEX, TAPD, Sched<[WriteVecExtract]>; + EVEX, TA, PD, Sched<[WriteVecExtract]>; - defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; + defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TA, PD; } } @@ -11402,14 +11402,14 @@ multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GRC:$dst, (extractelt (_.VT _.RC:$src1), imm:$src2))]>, - EVEX, TAPD, Sched<[WriteVecExtract]>; + EVEX, TA, PD, Sched<[WriteVecExtract]>; def mr : AVX512Ii8<0x16, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (extractelt (_.VT _.RC:$src1), imm:$src2),addr:$dst)]>, - EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TA, PD, Sched<[WriteVecExtractSt]>; } } @@ -11452,17 +11452,17 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>, - EVEX, VVVV, TAPD, Sched<[WriteVecInsert]>; + EVEX, VVVV, TA, PD, Sched<[WriteVecInsert]>; defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _, - _.ScalarLdFrag, imm>, TAPD; + _.ScalarLdFrag, imm>, TA, PD; } } defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info, - extloadi8>, TAPD, WIG; + extloadi8>, TA, PD, WIG; defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info, - extloadi16>, PD, WIG; + extloadi16>, TB, PD, WIG; defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, REX_W; @@ -11504,8 +11504,8 @@ multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_FP>{ AVX512AIi8Base, EVEX, VVVV; } -defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_f32_info>, PS; -defm 
VSHUFPD: avx512_shufp<"vshufpd", avx512vl_f64_info>, PD, REX_W; +defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_f32_info>, TB, PS; +defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_f64_info>, TB, REX_W; //===----------------------------------------------------------------------===// // AVX-512 - Byte shift Left/Right @@ -12217,13 +12217,13 @@ multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode, (ins VTI.RC:$src2, VTI.RC:$src3), OpStr, "$src3, $src2", "$src2, $src3", (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3))>, - T8PD, EVEX, VVVV, Sched<[sched]>; + T8, PD, EVEX, VVVV, Sched<[sched]>; defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, "$src3, $src2", "$src2, $src3", (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.LdFrag addr:$src3))))>, - T8PD, EVEX, VVVV, + T8, PD, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } } @@ -12239,7 +12239,7 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, "$src2, ${src3}"#VTI.BroadcastStr, (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, - T8PD, EVEX, VVVV, EVEX_B, + T8, PD, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12321,13 +12321,13 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3)), IsCommutable, IsCommutable>, - EVEX, VVVV, T8PD, Sched<[sched]>; + EVEX, VVVV, T8, PD, Sched<[sched]>; defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, "$src3, $src2", "$src2, $src3", (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.LdFrag addr:$src3))))>, - EVEX, VVVV, EVEX_CD8<32, CD8VF>, T8PD, + EVEX, VVVV, EVEX_CD8<32, CD8VF>, T8, PD, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), @@ -12337,7 +12337,7 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, (OpNode VTI.RC:$src1, VTI.RC:$src2, (VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>, EVEX, VVVV, EVEX_CD8<32, CD8VF>, EVEX_B, - T8PD, Sched<[sched.Folded, sched.ReadAfterFold, + T8, PD, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } } @@ -12406,7 +12406,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2)), (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1), - (VTI.VT VTI.RC:$src2))>, EVEX, VVVV, T8PD, + (VTI.VT VTI.RC:$src2))>, EVEX, VVVV, T8, PD, Sched<[sched]>; defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst), (ins VTI.RC:$src1, VTI.MemOp:$src2), @@ -12416,7 +12416,7 @@ multiclass VPSHUFBITQMB_rm<X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { (VTI.VT (VTI.LdFrag addr:$src2))), (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.LdFrag addr:$src2)))>, - EVEX, VVVV, EVEX_CD8<8, CD8VF>, T8PD, + EVEX, VVVV, EVEX_CD8<8, CD8VF>, T8, PD, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12451,7 +12451,7 @@ multiclass GF2P8MULB_avx512_common<bits<8> Op, string OpStr, SDNode OpNode, defm VGF2P8MULB : GF2P8MULB_avx512_common<0xCF, "vgf2p8mulb", X86GF2P8mulb, SchedWriteVecALU>, - EVEX_CD8<8, CD8VF>, T8PD; + EVEX_CD8<8, CD8VF>, T8; multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo VTI, @@ -12498,25 +12498,25 @@ let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle, defm V4FMADDPSrm : 
AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info, (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), "v4fmaddps", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>, + []>, EVEX_V512, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VQ>, Sched<[SchedWriteFMA.ZMM.Folded]>; defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info, (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), "v4fnmaddps", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>, + []>, EVEX_V512, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VQ>, Sched<[SchedWriteFMA.ZMM.Folded]>; defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info, (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), "v4fmaddss", "$src3, $src2", "$src2, $src3", - []>, VEX_LIG, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VF>, + []>, VEX_LIG, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VF>, Sched<[SchedWriteFMA.Scl.Folded]>; defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info, (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), "v4fnmaddss", "$src3, $src2", "$src2, $src3", - []>, VEX_LIG, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VF>, + []>, VEX_LIG, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VF>, Sched<[SchedWriteFMA.Scl.Folded]>; } @@ -12529,13 +12529,13 @@ let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedInt, defm VP4DPWSSDrm : AVX512_maskable_3src_in_asm<0x52, MRMSrcMem, v16i32_info, (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), "vp4dpwssd", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>, + []>, EVEX_V512, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VQ>, Sched<[SchedWriteFMA.ZMM.Folded]>; defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info, (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), "vp4dpwssds", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX, VVVV, T8XD, EVEX_CD8<32, CD8VQ>, + []>, EVEX_V512, EVEX, VVVV, T8, XD, EVEX_CD8<32, CD8VQ>, Sched<[SchedWriteFMA.ZMM.Folded]>; } @@ -12558,7 +12558,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRPC:$dst, (X86vp2intersect _.RC:$src1, (_.VT _.RC:$src2)))]>, - EVEX, VVVV, T8XD, Sched<[sched]>; + EVEX, VVVV, T8, XD, Sched<[sched]>; def rm : I<0x68, MRMSrcMem, (outs _.KRPC:$dst), @@ -12567,7 +12567,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRPC:$dst, (X86vp2intersect _.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>, - EVEX, VVVV, T8XD, EVEX_CD8<_.EltSize, CD8VF>, + EVEX, VVVV, T8, XD, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmb : I<0x68, MRMSrcMem, @@ -12577,7 +12577,7 @@ multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInf ", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"), [(set _.KRPC:$dst, (X86vp2intersect _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>, - EVEX, VVVV, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + EVEX, VVVV, T8, XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -12623,7 +12623,7 @@ let ExeDomain = SSEPackedSingle in defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16", SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF avx512vl_f32_info, avx512vl_bf16_info, - X86cvtne2ps2bf16, HasBF16, 0>, T8XD; + X86cvtne2ps2bf16, HasBF16, 0>, T8, XD; // Truncate Float to BFloat16 multiclass 
avx512_cvtps2bf16<bits<8> opc, string OpcodeStr, @@ -12660,7 +12660,7 @@ multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr, } defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16", - SchedWriteCvtPD2PS>, T8XS, + SchedWriteCvtPD2PS>, T8, XS, EVEX_CD8<32, CD8VF>; let Predicates = [HasBF16, HasVLX] in { @@ -12783,7 +12783,7 @@ multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, let ExeDomain = SSEPackedSingle in defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA, avx512vl_f32_info, avx512vl_bf16_info, - HasBF16>, T8XS, EVEX_CD8<32, CD8VF>; + HasBF16>, T8, XS, EVEX_CD8<32, CD8VF>; //===----------------------------------------------------------------------===// // AVX512FP16 @@ -12792,12 +12792,12 @@ defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWrit let Predicates = [HasFP16] in { // Move word ( r/m16) to Packed word def VMOVW2SHrr : AVX512<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), - "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, Sched<[WriteVecMoveFromGpr]>; + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5, PD, EVEX, Sched<[WriteVecMoveFromGpr]>; def VMOVWrm : AVX512<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i16mem:$src), "vmovw\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v8i16 (scalar_to_vector (loadi16 addr:$src))))]>, - T_MAP5PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFLoad]>; + T_MAP5, PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFLoad]>; def : Pat<(f16 (bitconvert GR16:$src)), (f16 (COPY_TO_REGCLASS @@ -12854,13 +12854,13 @@ def : Pat<(v16i32 (X86vzmovl // Move word from xmm register to r/m16 def VMOVSH2Wrr : AVX512<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), - "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, Sched<[WriteVecMoveToGpr]>; + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5, PD, EVEX, Sched<[WriteVecMoveToGpr]>; def VMOVWmr : AVX512<0x7E, MRMDestMem, (outs), (ins i16mem:$dst, VR128X:$src), "vmovw\t{$src, $dst|$dst, $src}", [(store (i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))), addr:$dst)]>, - T_MAP5PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFStore]>; + T_MAP5, PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFStore]>; def : Pat<(i16 (bitconvert FR16X:$src)), (i16 (EXTRACT_SUBREG @@ -12872,9 +12872,9 @@ def : Pat<(i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))), // Allow "vmovw" to use GR64 let hasSideEffects = 0 in { def VMOVW64toSHrr : AVX512<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), - "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, REX_W, Sched<[WriteVecMoveFromGpr]>; + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5, PD, EVEX, REX_W, Sched<[WriteVecMoveFromGpr]>; def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), - "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, REX_W, Sched<[WriteVecMoveToGpr]>; + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5, PD, EVEX, REX_W, Sched<[WriteVecMoveToGpr]>; } } @@ -12920,27 +12920,27 @@ multiclass avx512_cvttph2w<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo defm VCVTPH2UW : avx512_cvtph2w<0x7D, "vcvtph2uw", X86cvtp2UInt, X86cvtp2UInt, X86cvtp2UIntRnd, avx512vl_i16_info, avx512vl_f16_info, SchedWriteCvtPD2DQ>, - T_MAP5PS, EVEX_CD8<16, CD8VF>; + T_MAP5, PS, EVEX_CD8<16, CD8VF>; defm VCVTUW2PH : avx512_cvtph2w<0x7D, "vcvtuw2ph", any_uint_to_fp, uint_to_fp, X86VUintToFpRnd, avx512vl_f16_info, avx512vl_i16_info, SchedWriteCvtPD2DQ>, - T_MAP5XD, EVEX_CD8<16, CD8VF>; + T_MAP5, XD, EVEX_CD8<16, CD8VF>; defm VCVTTPH2W : avx512_cvttph2w<0x7C, 
"vcvttph2w", X86any_cvttp2si, X86cvttp2si, X86cvttp2siSAE, avx512vl_i16_info, avx512vl_f16_info, - SchedWriteCvtPD2DQ>, T_MAP5PD, EVEX_CD8<16, CD8VF>; + SchedWriteCvtPD2DQ>, T_MAP5, PD, EVEX_CD8<16, CD8VF>; defm VCVTTPH2UW : avx512_cvttph2w<0x7C, "vcvttph2uw", X86any_cvttp2ui, X86cvttp2ui, X86cvttp2uiSAE, avx512vl_i16_info, avx512vl_f16_info, - SchedWriteCvtPD2DQ>, T_MAP5PS, EVEX_CD8<16, CD8VF>; + SchedWriteCvtPD2DQ>, T_MAP5, PS, EVEX_CD8<16, CD8VF>; defm VCVTPH2W : avx512_cvtph2w<0x7D, "vcvtph2w", X86cvtp2Int, X86cvtp2Int, X86cvtp2IntRnd, avx512vl_i16_info, avx512vl_f16_info, SchedWriteCvtPD2DQ>, - T_MAP5PD, EVEX_CD8<16, CD8VF>; + T_MAP5, PD, EVEX_CD8<16, CD8VF>; defm VCVTW2PH : avx512_cvtph2w<0x7D, "vcvtw2ph", any_sint_to_fp, sint_to_fp, X86VSintToFpRnd, avx512vl_f16_info, avx512vl_i16_info, SchedWriteCvtPD2DQ>, - T_MAP5XS, EVEX_CD8<16, CD8VF>; + T_MAP5, XS, EVEX_CD8<16, CD8VF>; // Convert Half to Signed/Unsigned Doubleword multiclass avx512_cvtph2dq<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, @@ -12980,20 +12980,20 @@ multiclass avx512_cvttph2dq<bits<8> opc, string OpcodeStr, SDPatternOperator OpN defm VCVTPH2DQ : avx512_cvtph2dq<0x5B, "vcvtph2dq", X86cvtp2Int, X86cvtp2Int, - X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD, + X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5, PD, EVEX_CD8<16, CD8VH>; defm VCVTPH2UDQ : avx512_cvtph2dq<0x79, "vcvtph2udq", X86cvtp2UInt, X86cvtp2UInt, - X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5PS, + X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5, PS, EVEX_CD8<16, CD8VH>; defm VCVTTPH2DQ : avx512_cvttph2dq<0x5B, "vcvttph2dq", X86any_cvttp2si, X86cvttp2si, X86cvttp2siSAE, - SchedWriteCvtPS2DQ>, T_MAP5XS, + SchedWriteCvtPS2DQ>, T_MAP5, XS, EVEX_CD8<16, CD8VH>; defm VCVTTPH2UDQ : avx512_cvttph2dq<0x78, "vcvttph2udq", X86any_cvttp2ui, X86cvttp2ui, X86cvttp2uiSAE, - SchedWriteCvtPS2DQ>, T_MAP5PS, + SchedWriteCvtPS2DQ>, T_MAP5, PS, EVEX_CD8<16, CD8VH>; // Convert Half to Signed/Unsigned Quardword @@ -13043,21 +13043,21 @@ multiclass avx512_cvttph2qq<bits<8> opc, string OpcodeStr, SDPatternOperator OpN } defm VCVTPH2QQ : avx512_cvtph2qq<0x7B, "vcvtph2qq", X86cvtp2Int, X86cvtp2Int, - X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD, + X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5, PD, EVEX_CD8<16, CD8VQ>; defm VCVTPH2UQQ : avx512_cvtph2qq<0x79, "vcvtph2uqq", X86cvtp2UInt, X86cvtp2UInt, - X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5PD, + X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5, PD, EVEX_CD8<16, CD8VQ>; defm VCVTTPH2QQ : avx512_cvttph2qq<0x7A, "vcvttph2qq", X86any_cvttp2si, X86cvttp2si, X86cvttp2siSAE, - SchedWriteCvtPS2DQ>, T_MAP5PD, + SchedWriteCvtPS2DQ>, T_MAP5, PD, EVEX_CD8<16, CD8VQ>; defm VCVTTPH2UQQ : avx512_cvttph2qq<0x78, "vcvttph2uqq", X86any_cvttp2ui, X86cvttp2ui, X86cvttp2uiSAE, - SchedWriteCvtPS2DQ>, T_MAP5PD, + SchedWriteCvtPS2DQ>, T_MAP5, PD, EVEX_CD8<16, CD8VQ>; // Convert Signed/Unsigned Quardword to Half @@ -13154,53 +13154,53 @@ multiclass avx512_cvtqq2ph<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo } defm VCVTQQ2PH : avx512_cvtqq2ph<0x5B, "vcvtqq2ph", any_sint_to_fp, sint_to_fp, - X86VSintToFpRnd, SchedWriteCvtDQ2PS>, REX_W, T_MAP5PS, + X86VSintToFpRnd, SchedWriteCvtDQ2PS>, REX_W, T_MAP5, PS, EVEX_CD8<64, CD8VF>; defm VCVTUQQ2PH : avx512_cvtqq2ph<0x7A, "vcvtuqq2ph", any_uint_to_fp, uint_to_fp, - X86VUintToFpRnd, SchedWriteCvtDQ2PS>, REX_W, T_MAP5XD, + X86VUintToFpRnd, SchedWriteCvtDQ2PS>, REX_W, T_MAP5, XD, EVEX_CD8<64, CD8VF>; // Convert half to signed/unsigned int 32/64 defm VCVTSH2SIZ: avx512_cvt_s_int_round<0x2D, f16x_info, 
i32x_info, X86cvts2si, X86cvts2siRnd, WriteCvtSS2I, "cvtsh2si", "{l}", HasFP16>, - T_MAP5XS, EVEX_CD8<16, CD8VT1>; + T_MAP5, XS, EVEX_CD8<16, CD8VT1>; defm VCVTSH2SI64Z: avx512_cvt_s_int_round<0x2D, f16x_info, i64x_info, X86cvts2si, X86cvts2siRnd, WriteCvtSS2I, "cvtsh2si", "{q}", HasFP16>, - T_MAP5XS, REX_W, EVEX_CD8<16, CD8VT1>; + T_MAP5, XS, REX_W, EVEX_CD8<16, CD8VT1>; defm VCVTSH2USIZ: avx512_cvt_s_int_round<0x79, f16x_info, i32x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSS2I, "cvtsh2usi", "{l}", HasFP16>, - T_MAP5XS, EVEX_CD8<16, CD8VT1>; + T_MAP5, XS, EVEX_CD8<16, CD8VT1>; defm VCVTSH2USI64Z: avx512_cvt_s_int_round<0x79, f16x_info, i64x_info, X86cvts2usi, X86cvts2usiRnd, WriteCvtSS2I, "cvtsh2usi", "{q}", HasFP16>, - T_MAP5XS, REX_W, EVEX_CD8<16, CD8VT1>; + T_MAP5, XS, REX_W, EVEX_CD8<16, CD8VT1>; defm VCVTTSH2SIZ: avx512_cvt_s_all<0x2C, "vcvttsh2si", f16x_info, i32x_info, any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, - "{l}", HasFP16>, T_MAP5XS, EVEX_CD8<16, CD8VT1>; + "{l}", HasFP16>, T_MAP5, XS, EVEX_CD8<16, CD8VT1>; defm VCVTTSH2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsh2si", f16x_info, i64x_info, any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, - "{q}", HasFP16>, REX_W, T_MAP5XS, EVEX_CD8<16, CD8VT1>; + "{q}", HasFP16>, REX_W, T_MAP5, XS, EVEX_CD8<16, CD8VT1>; defm VCVTTSH2USIZ: avx512_cvt_s_all<0x78, "vcvttsh2usi", f16x_info, i32x_info, any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, - "{l}", HasFP16>, T_MAP5XS, EVEX_CD8<16, CD8VT1>; + "{l}", HasFP16>, T_MAP5, XS, EVEX_CD8<16, CD8VT1>; defm VCVTTSH2USI64Z: avx512_cvt_s_all<0x78, "vcvttsh2usi", f16x_info, i64x_info, any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, - "{q}", HasFP16>, T_MAP5XS, REX_W, EVEX_CD8<16, CD8VT1>; + "{q}", HasFP16>, T_MAP5, XS, REX_W, EVEX_CD8<16, CD8VT1>; let Predicates = [HasFP16] in { defm VCVTSI2SHZ : avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SS, GR32, v8f16x_info, i32mem, loadi32, "cvtsi2sh", "l">, - T_MAP5XS, EVEX_CD8<32, CD8VT1>; + T_MAP5, XS, EVEX_CD8<32, CD8VT1>; defm VCVTSI642SHZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SS, GR64, v8f16x_info, i64mem, loadi64, "cvtsi2sh","q">, - T_MAP5XS, REX_W, EVEX_CD8<64, CD8VT1>; + T_MAP5, XS, REX_W, EVEX_CD8<64, CD8VT1>; defm VCVTUSI2SHZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SS, GR32, v8f16x_info, i32mem, loadi32, - "cvtusi2sh","l">, T_MAP5XS, EVEX_CD8<32, CD8VT1>; + "cvtusi2sh","l">, T_MAP5, XS, EVEX_CD8<32, CD8VT1>; defm VCVTUSI642SHZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SS, GR64, v8f16x_info, i64mem, loadi64, "cvtusi2sh", "q">, - T_MAP5XS, REX_W, EVEX_CD8<64, CD8VT1>; + T_MAP5, XS, REX_W, EVEX_CD8<64, CD8VT1>; def : InstAlias<"vcvtsi2sh\t{$src, $src1, $dst|$dst, $src1, $src}", (VCVTSI2SHZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">; @@ -13446,14 +13446,14 @@ multiclass avx512_cfmulop_common<bits<8> opc, string OpcodeStr, SDNode OpNode, let Uses = [MXCSR] in { defm VFMADDCPH : avx512_cfmaop_common<0x56, "vfmaddcph", x86vfmaddc, x86vfmaddcRnd, 1>, - T_MAP6XS, EVEX_CD8<32, CD8VF>; + T_MAP6, XS, EVEX_CD8<32, CD8VF>; defm VFCMADDCPH : avx512_cfmaop_common<0x56, "vfcmaddcph", x86vfcmaddc, x86vfcmaddcRnd, 0>, - T_MAP6XD, EVEX_CD8<32, CD8VF>; + T_MAP6, XD, EVEX_CD8<32, CD8VF>; defm VFMULCPH : avx512_cfmulop_common<0xD6, "vfmulcph", x86vfmulc, x86vfmulc, - x86vfmulcRnd, 1>, T_MAP6XS, EVEX_CD8<32, CD8VF>; + x86vfmulcRnd, 1>, T_MAP6, XS, EVEX_CD8<32, CD8VF>; defm VFCMULCPH : 
avx512_cfmulop_common<0xD6, "vfcmulcph", x86vfcmulc, - x86vfcmulc, x86vfcmulcRnd, 0>, T_MAP6XD, EVEX_CD8<32, CD8VF>; + x86vfcmulc, x86vfcmulcRnd, 0>, T_MAP6, XD, EVEX_CD8<32, CD8VF>; } @@ -13504,12 +13504,12 @@ multiclass avx512_cfmbinop_sh_common<bits<8> opc, string OpcodeStr, SDNode OpNod let Uses = [MXCSR] in { defm VFMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfmaddcsh", x86vfmaddcSh, x86vfmaddcShRnd, 1>, - T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX, VVVV; + T_MAP6, XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX, VVVV; defm VFCMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfcmaddcsh", x86vfcmaddcSh, x86vfcmaddcShRnd, 0>, - T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX, VVVV; + T_MAP6, XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX, VVVV; defm VFMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfmulcsh", x86vfmulcSh, x86vfmulcShRnd, 1>, - T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV; + T_MAP6, XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV; defm VFCMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfcmulcsh", x86vfcmulcSh, x86vfcmulcShRnd, 0>, - T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV; + T_MAP6, XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX, VVVV; } diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 6f4b69c9b5c9f..4fb05231010d8 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1117,8 +1117,8 @@ let Predicates = [HasBMI, HasEGPR, In64BitMode] in { // Complexity is reduced to give and with immediate a chance to match first. let Defs = [EFLAGS], AddedComplexity = -6 in { - defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8PS; - defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8PS, REX_W; + defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8, PS; + defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8, PS, REX_W; } let Predicates = [HasBMI], AddedComplexity = -6 in { @@ -1141,12 +1141,12 @@ let hasSideEffects = 0 in { let Predicates = [HasBMI2, NoEGPR] in { def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8XD, VEX, VVVV, Sched<[WriteIMulH, sched]>; + []>, T8, XD, VEX, VVVV, Sched<[WriteIMulH, sched]>; let mayLoad = 1 in def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8XD, VEX, VVVV, + []>, T8, XD, VEX, VVVV, Sched<[WriteIMulHLd, sched.Folded, // Memory operand. ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -1165,11 +1165,11 @@ let Predicates = [HasBMI2, NoEGPR] in { let Predicates = [HasBMI2, HasEGPR, In64BitMode] in def rr#_EVEX : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8XD, EVEX, VVVV, Sched<[WriteIMulH, sched]>; + []>, T8, XD, EVEX, VVVV, Sched<[WriteIMulH, sched]>; let Predicates = [HasBMI2, HasEGPR, In64BitMode], mayLoad = 1 in def rm#_EVEX : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8XD, EVEX, VVVV, + []>, T8, XD, EVEX, VVVV, Sched<[WriteIMulHLd, sched.Folded, // Memory operand. 
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -1201,12 +1201,12 @@ class ADCOXOpRM <string m, X86TypeInfo t> let OpSize = OpSizeFixed, Constraints = "$src1 = $dst", Predicates = [HasADX] in { -def ADCX32rr : ADCOXOpRR<"adcx", Xi32>, T8PD; -def ADCX64rr : ADCOXOpRR<"adcx", Xi64>, T8PD; -def ADOX32rr : ADCOXOpRR<"adox", Xi32>, T8XS; -def ADOX64rr : ADCOXOpRR<"adox", Xi64>, T8XS; -def ADCX32rm : ADCOXOpRM<"adcx", Xi32>, T8PD; -def ADCX64rm : ADCOXOpRM<"adcx", Xi64>, T8PD; -def ADOX32rm : ADCOXOpRM<"adox", Xi32>, T8XS; -def ADOX64rm : ADCOXOpRM<"adox", Xi64>, T8XS; +def ADCX32rr : ADCOXOpRR<"adcx", Xi32>, T8, PD; +def ADCX64rr : ADCOXOpRR<"adcx", Xi64>, T8, PD; +def ADOX32rr : ADCOXOpRR<"adox", Xi32>, T8, XS; +def ADOX64rr : ADCOXOpRR<"adox", Xi64>, T8, XS; +def ADCX32rm : ADCOXOpRM<"adcx", Xi32>, T8, PD; +def ADCX64rm : ADCOXOpRM<"adcx", Xi64>, T8, PD; +def ADOX32rm : ADCOXOpRM<"adox", Xi32>, T8, XS; +def ADOX64rm : ADCOXOpRM<"adox", Xi64>, T8, XS; } diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td index 09655d9391211..dd63e921b8acd 100644 --- a/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/llvm/lib/Target/X86/X86InstrFPStack.td @@ -666,20 +666,20 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>; let Uses = [FPSW, FPCW] in { def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst), - "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, PS, + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB, PS, Requires<[HasFXSR]>; def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst), "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>, - PS, Requires<[HasFXSR, In64BitMode]>; + TB, PS, Requires<[HasFXSR, In64BitMode]>; } // Uses = [FPSW, FPCW] let Defs = [FPSW, FPCW] in { def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src), "fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>, - PS, Requires<[HasFXSR]>; + TB, PS, Requires<[HasFXSR]>; def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src), "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>, - PS, Requires<[HasFXSR, In64BitMode]>; + TB, PS, Requires<[HasFXSR, In64BitMode]>; } // Defs = [FPSW, FPCW] } // SchedRW diff --git a/llvm/lib/Target/X86/X86InstrKL.td b/llvm/lib/Target/X86/X86InstrKL.td index a3392b691c0a2..4586fc541627f 100644 --- a/llvm/lib/Target/X86/X86InstrKL.td +++ b/llvm/lib/Target/X86/X86InstrKL.td @@ -19,17 +19,17 @@ let SchedRW = [WriteSystem], Predicates = [HasKL] in { let Uses = [XMM0, EAX], Defs = [EFLAGS] in { def LOADIWKEY : I<0xDC, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), "loadiwkey\t{$src2, $src1|$src1, $src2}", - [(int_x86_loadiwkey XMM0, VR128:$src1, VR128:$src2, EAX)]>, T8XS; + [(int_x86_loadiwkey XMM0, VR128:$src1, VR128:$src2, EAX)]>, T8, XS; } let Uses = [XMM0], Defs = [XMM0, XMM1, XMM2, XMM4, XMM5, XMM6, EFLAGS] in { def ENCODEKEY128 : I<0xFA, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "encodekey128\t{$src, $dst|$dst, $src}", []>, T8XS; + "encodekey128\t{$src, $dst|$dst, $src}", []>, T8, XS; } let Uses = [XMM0, XMM1], Defs = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, EFLAGS] in { def ENCODEKEY256 : I<0xFB, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "encodekey256\t{$src, $dst|$dst, $src}", []>, T8XS; + "encodekey256\t{$src, $dst|$dst, $src}", []>, T8, XS; } let Constraints = "$src1 = $dst", @@ -37,22 +37,22 @@ let SchedRW = [WriteSystem], Predicates = [HasKL] in { def AESENC128KL : I<0xDC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), "aesenc128kl\t{$src2, $src1|$src1, $src2}", [(set VR128:$dst, EFLAGS, - 
(X86aesenc128kl VR128:$src1, addr:$src2))]>, T8XS; + (X86aesenc128kl VR128:$src1, addr:$src2))]>, T8, XS; def AESDEC128KL : I<0xDD, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), "aesdec128kl\t{$src2, $src1|$src1, $src2}", [(set VR128:$dst, EFLAGS, - (X86aesdec128kl VR128:$src1, addr:$src2))]>, T8XS; + (X86aesdec128kl VR128:$src1, addr:$src2))]>, T8, XS; def AESENC256KL : I<0xDE, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), "aesenc256kl\t{$src2, $src1|$src1, $src2}", [(set VR128:$dst, EFLAGS, - (X86aesenc256kl VR128:$src1, addr:$src2))]>, T8XS; + (X86aesenc256kl VR128:$src1, addr:$src2))]>, T8, XS; def AESDEC256KL : I<0xDF, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, opaquemem:$src2), "aesdec256kl\t{$src2, $src1|$src1, $src2}", [(set VR128:$dst, EFLAGS, - (X86aesdec256kl VR128:$src1, addr:$src2))]>, T8XS; + (X86aesdec256kl VR128:$src1, addr:$src2))]>, T8, XS; } } // SchedRW, Predicates @@ -62,13 +62,13 @@ let SchedRW = [WriteSystem], Predicates = [HasWIDEKL] in { Defs = [EFLAGS, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7], mayLoad = 1 in { def AESENCWIDE128KL : I<0xD8, MRM0m, (outs), (ins opaquemem:$src), - "aesencwide128kl\t$src", []>, T8XS; + "aesencwide128kl\t$src", []>, T8, XS; def AESDECWIDE128KL : I<0xD8, MRM1m, (outs), (ins opaquemem:$src), - "aesdecwide128kl\t$src", []>, T8XS; + "aesdecwide128kl\t$src", []>, T8, XS; def AESENCWIDE256KL : I<0xD8, MRM2m, (outs), (ins opaquemem:$src), - "aesencwide256kl\t$src", []>, T8XS; + "aesencwide256kl\t$src", []>, T8, XS; def AESDECWIDE256KL : I<0xD8, MRM3m, (outs), (ins opaquemem:$src), - "aesdecwide256kl\t$src", []>, T8XS; + "aesdecwide256kl\t$src", []>, T8, XS; } } // SchedRW, Predicates diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td index 9796379aa0bf0..8d472ccd52df3 100644 --- a/llvm/lib/Target/X86/X86InstrMMX.td +++ b/llvm/lib/Target/X86/X86InstrMMX.td @@ -487,24 +487,24 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, // -- Conversion Instructions defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", - WriteCvtPS2I, SSEPackedSingle>, PS, SIMD_EXC; + WriteCvtPS2I, SSEPackedSingle>, TB, PS, SIMD_EXC; defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi, f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}", - WriteCvtPD2I, SSEPackedDouble>, PD, SIMD_EXC; + WriteCvtPD2I, SSEPackedDouble>, TB, PD, SIMD_EXC; defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi, f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}", - WriteCvtPS2I, SSEPackedSingle>, PS, SIMD_EXC; + WriteCvtPS2I, SSEPackedSingle>, TB, PS, SIMD_EXC; defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi, f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}", - WriteCvtPD2I, SSEPackedDouble>, PD, SIMD_EXC; + WriteCvtPD2I, SSEPackedDouble>, TB, PD, SIMD_EXC; defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd, i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}", - WriteCvtI2PD, SSEPackedDouble>, PD; + WriteCvtI2PD, SSEPackedDouble>, TB, PD; let Constraints = "$src1 = $dst" in { defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128, int_x86_sse_cvtpi2ps, i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}", - SSEPackedSingle>, PS, SIMD_EXC; + SSEPackedSingle>, TB, PS, SIMD_EXC; } // Extract / Insert diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index d3a3fb7fefc23..779f27085eae0 100644 --- 
a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -165,10 +165,10 @@ def POPP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "popp\t$reg", []>, REX_W, ExplicitREX2Prefix, Requires<[In64BitMode]>; def POP2: I<0x8F, MRM0r, (outs GR64:$reg1, GR64:$reg2), (ins), "pop2\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX, VVVV, EVEX_B, T_MAP4PS; + []>, EVEX, VVVV, EVEX_B, T_MAP4, PS; def POP2P: I<0x8F, MRM0r, (outs GR64:$reg1, GR64:$reg2), (ins), "pop2p\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX, VVVV, EVEX_B, T_MAP4PS, REX_W; + []>, EVEX, VVVV, EVEX_B, T_MAP4, PS, REX_W; } // mayLoad, SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in @@ -186,10 +186,10 @@ def PUSHP64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "pushp\t$reg", []>, REX_W, ExplicitREX2Prefix, Requires<[In64BitMode]>; def PUSH2: I<0xFF, MRM6r, (outs), (ins GR64:$reg1, GR64:$reg2), "push2\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX, VVVV, EVEX_B, T_MAP4PS; + []>, EVEX, VVVV, EVEX_B, T_MAP4, PS; def PUSH2P: I<0xFF, MRM6r, (outs), (ins GR64:$reg1, GR64:$reg2), "push2p\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX, VVVV, EVEX_B, T_MAP4PS, REX_W; + []>, EVEX, VVVV, EVEX_B, T_MAP4, PS, REX_W; } // mayStore, SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in { def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>, @@ -251,52 +251,52 @@ let Defs = [EFLAGS] in { def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsf{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, - PS, OpSize16, Sched<[WriteBSF]>; + TB, PS, OpSize16, Sched<[WriteBSF]>; def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsf{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))]>, - PS, OpSize16, Sched<[WriteBSFLd]>; + TB, PS, OpSize16, Sched<[WriteBSFLd]>; def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsf{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, - PS, OpSize32, Sched<[WriteBSF]>; + TB, PS, OpSize32, Sched<[WriteBSF]>; def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsf{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, - PS, OpSize32, Sched<[WriteBSFLd]>; + TB, PS, OpSize32, Sched<[WriteBSFLd]>; def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsf{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, - PS, Sched<[WriteBSF]>; + TB, PS, Sched<[WriteBSF]>; def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsf{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, - PS, Sched<[WriteBSFLd]>; + TB, PS, Sched<[WriteBSFLd]>; def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsr{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, - PS, OpSize16, Sched<[WriteBSR]>; + TB, PS, OpSize16, Sched<[WriteBSR]>; def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsr{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, - PS, OpSize16, Sched<[WriteBSRLd]>; + TB, PS, OpSize16, Sched<[WriteBSRLd]>; def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsr{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, - PS, OpSize32, Sched<[WriteBSR]>; + TB, PS, OpSize32, Sched<[WriteBSR]>; def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsr{l}\t{$src, 
$dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, - PS, OpSize32, Sched<[WriteBSRLd]>; + TB, PS, OpSize32, Sched<[WriteBSRLd]>; def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsr{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, - PS, Sched<[WriteBSR]>; + TB, PS, Sched<[WriteBSR]>; def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsr{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, - PS, Sched<[WriteBSRLd]>; + TB, PS, Sched<[WriteBSRLd]>; } // Defs = [EFLAGS] let SchedRW = [WriteMicrocoded] in { @@ -1095,29 +1095,29 @@ let Predicates = [HasMOVBE] in { def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "movbe{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>, - OpSize16, T8PS; + OpSize16, T8, PS; def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "movbe{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>, - OpSize32, T8PS; + OpSize32, T8, PS; def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "movbe{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>, - T8PS; + T8, PS; } let SchedRW = [WriteStore] in { def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "movbe{w}\t{$src, $dst|$dst, $src}", [(store (bswap GR16:$src), addr:$dst)]>, - OpSize16, T8PS; + OpSize16, T8, PS; def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movbe{l}\t{$src, $dst|$dst, $src}", [(store (bswap GR32:$src), addr:$dst)]>, - OpSize32, T8PS; + OpSize32, T8, PS; def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movbe{q}\t{$src, $dst|$dst, $src}", [(store (bswap GR64:$src), addr:$dst)]>, - T8PS; + T8, PS; } } @@ -1127,13 +1127,13 @@ let Predicates = [HasMOVBE] in { let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in { def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins), "rdrand{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86rdrand))]>, - OpSize16, PS; + OpSize16, TB, PS; def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), "rdrand{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86rdrand))]>, - OpSize32, PS; + OpSize32, TB, PS; def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), "rdrand{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86rdrand))]>, - PS; + TB, PS; } //===----------------------------------------------------------------------===// @@ -1141,11 +1141,11 @@ let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in { // let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in { def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), "rdseed{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, PS; + [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB, PS; def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), "rdseed{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, PS; + [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB, PS; def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdseed{q}\t$dst", - [(set GR64:$dst, EFLAGS, (X86rdseed))]>, PS; + [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB, PS; } //===----------------------------------------------------------------------===// @@ -1155,29 +1155,29 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in { def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "lzcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (ctlz GR16:$src)), 
(implicit EFLAGS)]>, - XS, OpSize16, Sched<[WriteLZCNT]>; + TB, XS, OpSize16, Sched<[WriteLZCNT]>; def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "lzcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (ctlz (loadi16 addr:$src))), - (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteLZCNTLd]>; + (implicit EFLAGS)]>, TB, XS, OpSize16, Sched<[WriteLZCNTLd]>; def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "lzcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, - XS, OpSize32, Sched<[WriteLZCNT]>; + TB, XS, OpSize32, Sched<[WriteLZCNT]>; def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "lzcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (ctlz (loadi32 addr:$src))), - (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteLZCNTLd]>; + (implicit EFLAGS)]>, TB, XS, OpSize32, Sched<[WriteLZCNTLd]>; def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "lzcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>, - XS, Sched<[WriteLZCNT]>; + TB, XS, Sched<[WriteLZCNT]>; def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "lzcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (ctlz (loadi64 addr:$src))), - (implicit EFLAGS)]>, XS, Sched<[WriteLZCNTLd]>; + (implicit EFLAGS)]>, TB, XS, Sched<[WriteLZCNTLd]>; } //===----------------------------------------------------------------------===// @@ -1187,29 +1187,29 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in { def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "tzcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, - XS, OpSize16, Sched<[WriteTZCNT]>; + TB, XS, OpSize16, Sched<[WriteTZCNT]>; def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "tzcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (cttz (loadi16 addr:$src))), - (implicit EFLAGS)]>, XS, OpSize16, Sched<[WriteTZCNTLd]>; + (implicit EFLAGS)]>, TB, XS, OpSize16, Sched<[WriteTZCNTLd]>; def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "tzcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, - XS, OpSize32, Sched<[WriteTZCNT]>; + TB, XS, OpSize32, Sched<[WriteTZCNT]>; def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "tzcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (cttz (loadi32 addr:$src))), - (implicit EFLAGS)]>, XS, OpSize32, Sched<[WriteTZCNTLd]>; + (implicit EFLAGS)]>, TB, XS, OpSize32, Sched<[WriteTZCNTLd]>; def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "tzcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>, - XS, Sched<[WriteTZCNT]>; + TB, XS, Sched<[WriteTZCNT]>; def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "tzcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (cttz (loadi64 addr:$src))), - (implicit EFLAGS)]>, XS, Sched<[WriteTZCNTLd]>; + (implicit EFLAGS)]>, TB, XS, Sched<[WriteTZCNTLd]>; } multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, @@ -1218,11 +1218,11 @@ multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, let hasSideEffects = 0 in { def rr#Suffix : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8PS, VEX, VVVV, Sched<[sched]>; + T8, PS, VEX, VVVV, Sched<[sched]>; let mayLoad = 1 in def rm#Suffix : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), !strconcat(mnemonic, 
"\t{$src, $dst|$dst, $src}"), []>, - T8PS, VEX, VVVV, Sched<[sched.Folded]>; + T8, PS, VEX, VVVV, Sched<[sched.Folded]>; } } @@ -1288,12 +1288,12 @@ multiclass bmi4VOp3_base<bits<8> opc, string mnemonic, RegisterClass RC, def rr#Suffix : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>, - T8PS, VEX, Sched<[Sched]>; + T8, PS, VEX, Sched<[Sched]>; let mayLoad = 1 in def rm#Suffix : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)), - (implicit EFLAGS)]>, T8PS, VEX, + (implicit EFLAGS)]>, T8, PS, VEX, Sched<[Sched.Folded, // x86memop:$src1 ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -1380,24 +1380,24 @@ multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, let Predicates = [HasBMI2, NoEGPR] in { defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, - X86pdep, loadi32>, T8XD; + X86pdep, loadi32>, T8, XD; defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, - X86pdep, loadi64>, T8XD, REX_W; + X86pdep, loadi64>, T8, XD, REX_W; defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, - X86pext, loadi32>, T8XS; + X86pext, loadi32>, T8, XS; defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, - X86pext, loadi64>, T8XS, REX_W; + X86pext, loadi64>, T8, XS, REX_W; } let Predicates = [HasBMI2, HasEGPR] in { defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem, - X86pdep, loadi32, "_EVEX">, T8XD, EVEX; + X86pdep, loadi32, "_EVEX">, T8, XD, EVEX; defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem, - X86pdep, loadi64, "_EVEX">, T8XD, REX_W, EVEX; + X86pdep, loadi64, "_EVEX">, T8, XD, REX_W, EVEX; defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem, - X86pext, loadi32, "_EVEX">, T8XS, EVEX; + X86pext, loadi32, "_EVEX">, T8, XS, EVEX; defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem, - X86pext, loadi64, "_EVEX">, T8XS, REX_W, EVEX; + X86pext, loadi64, "_EVEX">, T8, XS, REX_W, EVEX; } //===----------------------------------------------------------------------===// @@ -1471,22 +1471,22 @@ let SchedRW = [ WriteSystem ] in { let SchedRW = [WriteSystem] in { def UMONITOR16 : I<0xAE, MRM6r, (outs), (ins GR16:$src), "umonitor\t$src", [(int_x86_umonitor GR16:$src)]>, - XS, AdSize16, Requires<[HasWAITPKG, Not64BitMode]>; + TB, XS, AdSize16, Requires<[HasWAITPKG, Not64BitMode]>; def UMONITOR32 : I<0xAE, MRM6r, (outs), (ins GR32:$src), "umonitor\t$src", [(int_x86_umonitor GR32:$src)]>, - XS, AdSize32, Requires<[HasWAITPKG]>; + TB, XS, AdSize32, Requires<[HasWAITPKG]>; def UMONITOR64 : I<0xAE, MRM6r, (outs), (ins GR64:$src), "umonitor\t$src", [(int_x86_umonitor GR64:$src)]>, - XS, AdSize64, Requires<[HasWAITPKG, In64BitMode]>; + TB, XS, AdSize64, Requires<[HasWAITPKG, In64BitMode]>; let Uses = [EAX, EDX], Defs = [EFLAGS] in { def UMWAIT : I<0xAE, MRM6r, (outs), (ins GR32orGR64:$src), "umwait\t$src", [(set EFLAGS, (X86umwait GR32orGR64:$src, EDX, EAX))]>, - XD, Requires<[HasWAITPKG]>; + TB, XD, Requires<[HasWAITPKG]>; def TPAUSE : I<0xAE, MRM6r, (outs), (ins GR32orGR64:$src), "tpause\t$src", [(set EFLAGS, (X86tpause GR32orGR64:$src, EDX, EAX))]>, - PD, Requires<[HasWAITPKG]>; + TB, PD, Requires<[HasWAITPKG]>; } } // SchedRW @@ -1497,19 +1497,19 @@ let SchedRW = [WriteStore] in { def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore32 
addr:$dst, GR32:$src)]>, - T8PS, Requires<[HasMOVDIRI, NoEGPR]>; + T8, PS, Requires<[HasMOVDIRI, NoEGPR]>; def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore64 addr:$dst, GR64:$src)]>, - T8PS, Requires<[In64BitMode, HasMOVDIRI, NoEGPR]>; + T8, PS, Requires<[In64BitMode, HasMOVDIRI, NoEGPR]>; def MOVDIRI32_EVEX : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore32 addr:$dst, GR32:$src)]>, - EVEX, NoCD8, T_MAP4PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; + EVEX, NoCD8, T_MAP4, PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; def MOVDIRI64_EVEX : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore64 addr:$dst, GR64:$src)]>, - EVEX, NoCD8, T_MAP4PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; + EVEX, NoCD8, T_MAP4, PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1518,23 +1518,23 @@ def MOVDIRI64_EVEX : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), let SchedRW = [WriteStore] in { def MOVDIR64B16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem_GR16:$src), "movdir64b\t{$src, $dst|$dst, $src}", []>, - T8PD, AdSize16, Requires<[HasMOVDIR64B, Not64BitMode]>; + T8, PD, AdSize16, Requires<[HasMOVDIR64B, Not64BitMode]>; def MOVDIR64B32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem_GR32:$src), "movdir64b\t{$src, $dst|$dst, $src}", [(int_x86_movdir64b GR32:$dst, addr:$src)]>, - T8PD, AdSize32, Requires<[HasMOVDIR64B, NoEGPR]>; + T8, PD, AdSize32, Requires<[HasMOVDIR64B, NoEGPR]>; def MOVDIR64B64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem_GR64:$src), "movdir64b\t{$src, $dst|$dst, $src}", [(int_x86_movdir64b GR64:$dst, addr:$src)]>, - T8PD, AdSize64, Requires<[HasMOVDIR64B, NoEGPR, In64BitMode]>; + T8, PD, AdSize64, Requires<[HasMOVDIR64B, NoEGPR, In64BitMode]>; def MOVDIR64B32_EVEX : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem_GR32:$src), "movdir64b\t{$src, $dst|$dst, $src}", [(int_x86_movdir64b GR32:$dst, addr:$src)]>, - EVEX, NoCD8, T_MAP4PD, AdSize32, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>; + EVEX, NoCD8, T_MAP4, PD, AdSize32, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>; def MOVDIR64B64_EVEX : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem_GR64:$src), "movdir64b\t{$src, $dst|$dst, $src}", [(int_x86_movdir64b GR64:$dst, addr:$src)]>, - EVEX, NoCD8, T_MAP4PD, AdSize64, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>; + EVEX, NoCD8, T_MAP4, PD, AdSize64, Requires<[HasMOVDIR64B, HasEGPR, In64BitMode]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1544,28 +1544,28 @@ let SchedRW = [WriteStore], Defs = [EFLAGS] in { def ENQCMD16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src), "enqcmd\t{$src, $dst|$dst, $src}", [(set EFLAGS, (X86enqcmd GR16:$dst, addr:$src))]>, - T8XD, AdSize16, Requires<[HasENQCMD, Not64BitMode]>; + T8, XD, AdSize16, Requires<[HasENQCMD, Not64BitMode]>; def ENQCMD32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src), "enqcmd\t{$src, $dst|$dst, $src}", [(set EFLAGS, (X86enqcmd GR32:$dst, addr:$src))]>, - T8XD, AdSize32, Requires<[HasENQCMD]>; + T8, XD, AdSize32, Requires<[HasENQCMD]>; def ENQCMD64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src), "enqcmd\t{$src, $dst|$dst, $src}", [(set EFLAGS, (X86enqcmd GR64:$dst, addr:$src))]>, - T8XD, 
AdSize64, Requires<[HasENQCMD, In64BitMode]>; + T8, XD, AdSize64, Requires<[HasENQCMD, In64BitMode]>; def ENQCMDS16 : I<0xF8, MRMSrcMem, (outs), (ins GR16:$dst, i512mem:$src), "enqcmds\t{$src, $dst|$dst, $src}", [(set EFLAGS, (X86enqcmds GR16:$dst, addr:$src))]>, - T8XS, AdSize16, Requires<[HasENQCMD, Not64BitMode]>; + T8, XS, AdSize16, Requires<[HasENQCMD, Not64BitMode]>; def ENQCMDS32 : I<0xF8, MRMSrcMem, (outs), (ins GR32:$dst, i512mem:$src), "enqcmds\t{$src, $dst|$dst, $src}", [(set EFLAGS, (X86enqcmds GR32:$dst, addr:$src))]>, - T8XS, AdSize32, Requires<[HasENQCMD]>; + T8, XS, AdSize32, Requires<[HasENQCMD]>; def ENQCMDS64 : I<0xF8, MRMSrcMem, (outs), (ins GR64:$dst, i512mem:$src), "enqcmds\t{$src, $dst|$dst, $src}", [(set EFLAGS, (X86enqcmds GR64:$dst, addr:$src))]>, - T8XS, AdSize64, Requires<[HasENQCMD, In64BitMode]>; + T8, XS, AdSize64, Requires<[HasENQCMD, In64BitMode]>; } //===----------------------------------------------------------------------===// @@ -1588,11 +1588,11 @@ let SchedRW = [WriteSystem] in { let Uses = [EAX, EDX] in def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins), "invlpgb", []>, - PS, Requires<[Not64BitMode]>; + TB, PS, Requires<[Not64BitMode]>; let Uses = [RAX, EDX] in def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins), "invlpgb", []>, - PS, Requires<[In64BitMode]>; + TB, PS, Requires<[In64BitMode]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1602,7 +1602,7 @@ let SchedRW = [WriteSystem] in { let SchedRW = [WriteSystem] in { def TLBSYNC : I<0x01, MRM_FF, (outs), (ins), "tlbsync", []>, - PS, Requires<[]>; + TB, PS, Requires<[]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1610,14 +1610,14 @@ let SchedRW = [WriteSystem] in { // let Uses = [EAX], SchedRW = [WriteSystem] in def HRESET : Ii8<0xF0, MRM_C0, (outs), (ins i32u8imm:$imm), "hreset\t$imm", []>, - Requires<[HasHRESET]>, TAXS; + Requires<[HasHRESET]>, TA, XS; //===----------------------------------------------------------------------===// // SERIALIZE Instruction // let SchedRW = [WriteSystem] in def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize", - [(int_x86_serialize)]>, PS, + [(int_x86_serialize)]>, TB, PS, Requires<[HasSERIALIZE]>; //===----------------------------------------------------------------------===// @@ -1625,9 +1625,9 @@ let SchedRW = [WriteSystem] in // let Predicates = [HasTSXLDTRK], SchedRW = [WriteSystem] in { def XSUSLDTRK : I<0x01, MRM_E8, (outs), (ins), "xsusldtrk", - [(int_x86_xsusldtrk)]>, XD; + [(int_x86_xsusldtrk)]>, TB, XD; def XRESLDTRK : I<0x01, MRM_E9, (outs), (ins), "xresldtrk", - [(int_x86_xresldtrk)]>, XD; + [(int_x86_xresldtrk)]>, TB, XD; } //===----------------------------------------------------------------------===// @@ -1635,18 +1635,18 @@ let Predicates = [HasTSXLDTRK], SchedRW = [WriteSystem] in { // let Predicates = [HasUINTR, In64BitMode], SchedRW = [WriteSystem] in { def UIRET : I<0x01, MRM_EC, (outs), (ins), "uiret", - []>, XS; + []>, TB, XS; def CLUI : I<0x01, MRM_EE, (outs), (ins), "clui", - [(int_x86_clui)]>, XS; + [(int_x86_clui)]>, TB, XS; def STUI : I<0x01, MRM_EF, (outs), (ins), "stui", - [(int_x86_stui)]>, XS; + [(int_x86_stui)]>, TB, XS; def SENDUIPI : I<0xC7, MRM6r, (outs), (ins GR64:$arg), "senduipi\t$arg", - [(int_x86_senduipi GR64:$arg)]>, XS; + [(int_x86_senduipi GR64:$arg)]>, TB, XS; let Defs = [EFLAGS] in def TESTUI : I<0x01, MRM_ED, (outs), (ins), "testui", - [(set EFLAGS, (X86testui))]>, XS; + [(set EFLAGS, (X86testui))]>, TB, XS; } 
//===----------------------------------------------------------------------===// @@ -1670,14 +1670,14 @@ def CMPCCXADDmr32 : I<0xe0, MRMDestMem4VOp3CC, (outs GR32:$dst), "cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", [(set GR32:$dst, (X86cmpccxadd addr:$dstsrc2, GR32:$dstsrc1, GR32:$src3, timm:$cond))]>, - VEX, VVVV, T8PD, Sched<[WriteXCHG]>; + VEX, VVVV, T8, PD, Sched<[WriteXCHG]>; def CMPCCXADDmr64 : I<0xe0, MRMDestMem4VOp3CC, (outs GR64:$dst), (ins GR64:$dstsrc1, i64mem:$dstsrc2, GR64:$src3, ccode:$cond), "cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", [(set GR64:$dst, (X86cmpccxadd addr:$dstsrc2, GR64:$dstsrc1, GR64:$src3, timm:$cond))]>, - VEX, VVVV, REX_W, T8PD, Sched<[WriteXCHG]>; + VEX, VVVV, REX_W, T8, PD, Sched<[WriteXCHG]>; } let Predicates = [HasCMPCCXADD, HasEGPR, In64BitMode] in { @@ -1686,14 +1686,14 @@ def CMPCCXADDmr32_EVEX : I<0xe0, MRMDestMem4VOp3CC, (outs GR32:$dst), "cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", [(set GR32:$dst, (X86cmpccxadd addr:$dstsrc2, GR32:$dstsrc1, GR32:$src3, timm:$cond))]>, - EVEX, VVVV, NoCD8, T8PD, Sched<[WriteXCHG]>; + EVEX, VVVV, NoCD8, T8, PD, Sched<[WriteXCHG]>; def CMPCCXADDmr64_EVEX : I<0xe0, MRMDestMem4VOp3CC, (outs GR64:$dst), (ins GR64:$dstsrc1, i64mem:$dstsrc2, GR64:$src3, ccode:$cond), "cmp${cond}xadd\t{$src3, $dst, $dstsrc2|$dstsrc2, $dst, $src3}", [(set GR64:$dst, (X86cmpccxadd addr:$dstsrc2, GR64:$dstsrc1, GR64:$src3, timm:$cond))]>, - EVEX, VVVV, NoCD8, REX_W, T8PD, Sched<[WriteXCHG]>; + EVEX, VVVV, NoCD8, REX_W, T8, PD, Sched<[WriteXCHG]>; } } @@ -1703,12 +1703,12 @@ def CMPCCXADDmr64_EVEX : I<0xe0, MRMDestMem4VOp3CC, (outs GR64:$dst), let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), - "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD; + "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, TB, PD; let Predicates = [HasCLWB], SchedRW = [WriteLoad] in def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", - [(int_x86_clwb addr:$src)]>, PD; + [(int_x86_clwb addr:$src)]>, TB, PD; let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", - [(int_x86_cldemote addr:$src)]>, PS; + [(int_x86_cldemote addr:$src)]>, TB, PS; diff --git a/llvm/lib/Target/X86/X86InstrRAOINT.td b/llvm/lib/Target/X86/X86InstrRAOINT.td index dc0e267a83e39..601355d4f7de4 100644 --- a/llvm/lib/Target/X86/X86InstrRAOINT.td +++ b/llvm/lib/Target/X86/X86InstrRAOINT.td @@ -39,7 +39,7 @@ multiclass RAOINT_BASE<string OpcodeStr> { Sched<[WriteALURMW]>, REX_W; } -defm AADD : RAOINT_BASE<"add">, T8PS; -defm AAND : RAOINT_BASE<"and">, T8PD; -defm AOR : RAOINT_BASE<"or" >, T8XD; -defm AXOR : RAOINT_BASE<"xor">, T8XS; +defm AADD : RAOINT_BASE<"add">, T8, PS; +defm AAND : RAOINT_BASE<"and">, T8, PD; +defm AOR : RAOINT_BASE<"or" >, T8, XD; +defm AXOR : RAOINT_BASE<"xor">, T8, XS; diff --git a/llvm/lib/Target/X86/X86InstrSGX.td b/llvm/lib/Target/X86/X86InstrSGX.td index 6439f717accb9..3c8d6e3c6b6b3 100644 --- a/llvm/lib/Target/X86/X86InstrSGX.td +++ b/llvm/lib/Target/X86/X86InstrSGX.td @@ -17,13 +17,13 @@ let SchedRW = [WriteSystem], Predicates = [HasSGX] in { // ENCLS - Execute an Enclave System Function of Specified Leaf Number def ENCLS : I<0x01, MRM_CF, (outs), (ins), - "encls", []>, PS; + "encls", []>, TB, PS; // ENCLU - Execute an Enclave User Function of Specified Leaf Number def ENCLU : I<0x01, MRM_D7, (outs), (ins), - "enclu", []>, PS; + "enclu", []>, TB, 
PS; // ENCLV - Execute an Enclave VMM Function of Specified Leaf Number def ENCLV : I<0x01, MRM_C0, (outs), (ins), - "enclv", []>, PS; + "enclv", []>, TB, PS; } // SchedRW diff --git a/llvm/lib/Target/X86/X86InstrSNP.td b/llvm/lib/Target/X86/X86InstrSNP.td index ab13fa43c92dd..05ed6585db6df 100644 --- a/llvm/lib/Target/X86/X86InstrSNP.td +++ b/llvm/lib/Target/X86/X86InstrSNP.td @@ -17,31 +17,31 @@ let SchedRW = [WriteSystem] in { // F3 0F 01 FF let Uses = [RAX], Defs = [EAX, EFLAGS] in -def PSMASH: I<0x01, MRM_FF, (outs), (ins), "psmash", []>, XS, +def PSMASH: I<0x01, MRM_FF, (outs), (ins), "psmash", []>, TB, XS, Requires<[In64BitMode]>; // F2 0F 01 FF let Uses = [RAX, RCX, RDX], Defs = [EAX, EFLAGS] in def PVALIDATE64: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>, - XD, Requires<[In64BitMode]>; + TB, XD, Requires<[In64BitMode]>; let Uses = [EAX, ECX, EDX], Defs = [EAX, EFLAGS] in def PVALIDATE32: I<0x01, MRM_FF, (outs), (ins), "pvalidate",[]>, - XD, Requires<[Not64BitMode]>; + TB, XD, Requires<[Not64BitMode]>; // F2 0F 01 FE let Uses = [RAX, RCX], Defs = [EAX, EFLAGS] in -def RMPUPDATE: I<0x01, MRM_FE, (outs), (ins), "rmpupdate", []>, XD, +def RMPUPDATE: I<0x01, MRM_FE, (outs), (ins), "rmpupdate", []>, TB, XD, Requires<[In64BitMode]>; // F3 0F 01 FE let Uses = [RAX, RCX, RDX], Defs = [EAX, EFLAGS] in -def RMPADJUST: I<0x01, MRM_FE, (outs), (ins), "rmpadjust", []>, XS, +def RMPADJUST: I<0x01, MRM_FE, (outs), (ins), "rmpadjust", []>, TB, XS, Requires<[In64BitMode]>; // F3 0F 01 FD let Uses = [RAX, RDX], Defs = [RAX, RCX, RDX, EFLAGS] in -def RMPQUERY: I<0x01, MRM_FD, (outs), (ins), "rmpquery", []>, XS, +def RMPQUERY: I<0x01, MRM_FD, (outs), (ins), "rmpquery", []>, TB, XS, Requires<[In64BitMode]>; } // SchedRW diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index d91c7740aae39..27d3974a674ab 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -268,15 +268,15 @@ multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop, } defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss", - SSEPackedSingle, UseSSE1>, XS; + SSEPackedSingle, UseSSE1>, TB, XS; defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd", - SSEPackedDouble, UseSSE2>, XD; + SSEPackedDouble, UseSSE2>, TB, XD; let canFoldAsLoad = 1, isReMaterializable = 1 in { defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss", - SSEPackedSingle>, XS; + SSEPackedSingle>, TB, XS; defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd", - SSEPackedDouble>, XD; + SSEPackedDouble>, TB, XD; } // Patterns @@ -352,46 +352,46 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in let Predicates = [HasAVX, NoVLX] in { defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SchedWriteFMoveLS.XMM>, - PS, VEX, WIG; + TB, PS, VEX, WIG; defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd", SSEPackedDouble, SchedWriteFMoveLS.XMM>, - PD, VEX, WIG; + TB, PD, VEX, WIG; defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SchedWriteFMoveLS.XMM>, - PS, VEX, WIG; + TB, PS, VEX, WIG; defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, SchedWriteFMoveLS.XMM>, - PD, VEX, WIG; + TB, PD, VEX, WIG; defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps", SSEPackedSingle, SchedWriteFMoveLS.YMM>, - PS, VEX, VEX_L, WIG; + TB, PS, VEX, VEX_L, WIG; 
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd", SSEPackedDouble, SchedWriteFMoveLS.YMM>, - PD, VEX, VEX_L, WIG; + TB, PD, VEX, VEX_L, WIG; defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle, SchedWriteFMoveLS.YMM>, - PS, VEX, VEX_L, WIG; + TB, PS, VEX, VEX_L, WIG; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SchedWriteFMoveLS.YMM>, - PD, VEX, VEX_L, WIG; + TB, PD, VEX, VEX_L, WIG; } let Predicates = [UseSSE1] in { defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SchedWriteFMoveLS.XMM>, - PS; + TB, PS; defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SchedWriteFMoveLS.XMM>, - PS; + TB, PS; } let Predicates = [UseSSE2] in { defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd", SSEPackedDouble, SchedWriteFMoveLS.XMM>, - PD; + TB, PD; defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, SchedWriteFMoveLS.XMM>, - PD; + TB, PD; } let Predicates = [HasAVX, NoVLX] in { @@ -666,7 +666,7 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode, def PSrm : PI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), !strconcat(base_opc, "s", asm_opr), - [], SSEPackedSingle>, PS, + [], SSEPackedSingle>, TB, PS, Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; def PDrm : PI<opc, MRMSrcMem, @@ -674,7 +674,7 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode, !strconcat(base_opc, "d", asm_opr), [(set VR128:$dst, (v2f64 (pdnode VR128:$src1, (scalar_to_vector (loadf64 addr:$src2)))))], - SSEPackedDouble>, PD, + SSEPackedDouble>, TB, PD, Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; } @@ -903,36 +903,36 @@ let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPExceptio defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, - XS, VEX, VEX_LIG; + TB, XS, VEX, VEX_LIG; defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, - XS, VEX, REX_W, VEX_LIG; + TB, XS, VEX, REX_W, VEX_LIG; defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, - XD, VEX, VEX_LIG; + TB, XD, VEX, VEX_LIG; defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, - XD, VEX, REX_W, VEX_LIG; + TB, XD, VEX, REX_W, VEX_LIG; defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32, "cvtss2si", "cvtss2si", WriteCvtSS2I, SSEPackedSingle>, - XS, VEX, VEX_LIG; + TB, XS, VEX, VEX_LIG; defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32, "cvtss2si", "cvtss2si", WriteCvtSS2I, SSEPackedSingle>, - XS, VEX, REX_W, VEX_LIG; + TB, XS, VEX, REX_W, VEX_LIG; defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64, "cvtsd2si", "cvtsd2si", WriteCvtSD2I, SSEPackedDouble>, - XD, VEX, VEX_LIG; + TB, XD, VEX, VEX_LIG; defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64, "cvtsd2si", "cvtsd2si", WriteCvtSD2I, SSEPackedDouble>, - XD, VEX, REX_W, VEX_LIG; + TB, XD, VEX, REX_W, VEX_LIG; } // The assembler can recognize rr 64-bit instructions by seeing a rxx @@ 
-941,16 +941,16 @@ defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64, // where appropriate to do so. let isCodeGenOnly = 1 in { defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l", - WriteCvtI2SS, SSEPackedSingle>, XS, VEX, VVVV, + WriteCvtI2SS, SSEPackedSingle>, TB, XS, VEX, VVVV, VEX_LIG, SIMD_EXC; defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q", - WriteCvtI2SS, SSEPackedSingle>, XS, VEX, VVVV, + WriteCvtI2SS, SSEPackedSingle>, TB, XS, VEX, VVVV, REX_W, VEX_LIG, SIMD_EXC; defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l", - WriteCvtI2SD, SSEPackedDouble>, XD, VEX, VVVV, + WriteCvtI2SD, SSEPackedDouble>, TB, XD, VEX, VVVV, VEX_LIG; defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q", - WriteCvtI2SD, SSEPackedDouble>, XD, VEX, VVVV, + WriteCvtI2SD, SSEPackedDouble>, TB, XD, VEX, VVVV, REX_W, VEX_LIG, SIMD_EXC; } // isCodeGenOnly = 1 @@ -983,42 +983,42 @@ let Predicates = [UseAVX] in { let isCodeGenOnly = 1 in { defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", - WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC; + WriteCvtSS2I, SSEPackedSingle>, TB, XS, SIMD_EXC; defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", - WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC; + WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W, SIMD_EXC; defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", - WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC; + WriteCvtSD2I, SSEPackedDouble>, TB, XD, SIMD_EXC; defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", - WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC; + WriteCvtSD2I, SSEPackedDouble>, TB, XD, REX_W, SIMD_EXC; defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32, "cvtss2si", "cvtss2si", - WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC; + WriteCvtSS2I, SSEPackedSingle>, TB, XS, SIMD_EXC; defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32, "cvtss2si", "cvtss2si", - WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC; + WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W, SIMD_EXC; defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64, "cvtsd2si", "cvtsd2si", - WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC; + WriteCvtSD2I, SSEPackedDouble>, TB, XD, SIMD_EXC; defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64, "cvtsd2si", "cvtsd2si", - WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC; + WriteCvtSD2I, SSEPackedDouble>, TB, XD, REX_W, SIMD_EXC; defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32, "cvtsi2ss", "cvtsi2ss{l}", - WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC; + WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, TB, XS, SIMD_EXC; defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64, "cvtsi2ss", "cvtsi2ss{q}", - WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC; + WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, TB, XS, REX_W, SIMD_EXC; defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32, "cvtsi2sd", "cvtsi2sd{l}", - WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD; + WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, TB, XD; defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64, "cvtsi2sd", "cvtsi2sd{q}", - WriteCvtI2SD, SSEPackedDouble, 
ReadInt2Fpu>, XD, REX_W, SIMD_EXC; + WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, TB, XD, REX_W, SIMD_EXC; } // isCodeGenOnly = 1 let Predicates = [UseSSE1] in { @@ -1074,46 +1074,46 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in { let Predicates = [UseAVX] in { defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", - WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG; + WriteCvtSD2I, SSEPackedDouble>, TB, XD, VEX, VEX_LIG; defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", - WriteCvtSD2I, SSEPackedDouble>, XD, VEX, REX_W, VEX_LIG; + WriteCvtSD2I, SSEPackedDouble>, TB, XD, VEX, REX_W, VEX_LIG; } defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I, - SSEPackedDouble>, XD; + SSEPackedDouble>, TB, XD; defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I, - SSEPackedDouble>, XD, REX_W; + SSEPackedDouble>, TB, XD, REX_W; } let Predicates = [UseAVX] in { defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>, - XS, VEX, VVVV, VEX_LIG, SIMD_EXC; + TB, XS, VEX, VVVV, VEX_LIG, SIMD_EXC; defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>, - XS, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC; + TB, XS, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC; defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>, - XD, VEX, VVVV, VEX_LIG; + TB, XD, VEX, VVVV, VEX_LIG; defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>, - XD, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC; + TB, XD, VEX, VVVV, VEX_LIG, REX_W, SIMD_EXC; } let Constraints = "$src1 = $dst" in { defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>, - XS, SIMD_EXC; + TB, XS, SIMD_EXC; defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>, - XS, REX_W, SIMD_EXC; + TB, XS, REX_W, SIMD_EXC; defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>, - XD; + TB, XD; defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>, - XD, REX_W, SIMD_EXC; + TB, XD, REX_W, SIMD_EXC; } def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -1150,34 +1150,34 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in { defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", - WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG; + WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, VEX_LIG; defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, - XS, VEX, VEX_LIG, REX_W; + TB, XS, VEX, VEX_LIG, REX_W; defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", - WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG; + WriteCvtSS2I, SSEPackedDouble>, TB, XD, VEX, VEX_LIG; defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", WriteCvtSS2I, 
SSEPackedDouble>, - XD, VEX, VEX_LIG, REX_W; + TB, XD, VEX, VEX_LIG, REX_W; } let Uses = [MXCSR], mayRaiseFPException = 1 in { defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", - WriteCvtSS2I, SSEPackedSingle>, XS; + WriteCvtSS2I, SSEPackedSingle>, TB, XS; defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, - XS, REX_W; + TB, XS, REX_W; defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", - WriteCvtSD2I, SSEPackedDouble>, XD; + WriteCvtSD2I, SSEPackedDouble>, TB, XD; defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, - XD, REX_W; + TB, XD, REX_W; } def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", @@ -1217,32 +1217,32 @@ def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in { defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG; + WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, VEX_LIG; defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I, SSEPackedSingle>, XS, VEX, REX_W, VEX_LIG; + WriteCvtSS2I, SSEPackedSingle>, TB, XS, VEX, REX_W, VEX_LIG; } let Uses = [MXCSR], mayRaiseFPException = 1 in { defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I, SSEPackedSingle>, XS; + WriteCvtSS2I, SSEPackedSingle>, TB, XS; defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I, SSEPackedSingle>, XS, REX_W; + WriteCvtSS2I, SSEPackedSingle>, TB, XS, REX_W; defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PS>, - PS, VEX, Requires<[HasAVX, NoVLX]>, WIG; + TB, PS, VEX, Requires<[HasAVX, NoVLX]>, WIG; defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PSY>, - PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, WIG; + TB, PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, WIG; defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop, "cvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PS>, - PS, Requires<[UseSSE2]>; + TB, PS, Requires<[UseSSE2]>; } // AVX aliases @@ -1295,7 +1295,7 @@ let mayLoad = 1 in def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - XD, VEX, VVVV, VEX_LIG, WIG, + TB, XD, VEX, VVVV, VEX_LIG, WIG, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC; } @@ -1311,7 +1311,7 @@ def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), "cvtsd2ss\t{$src, $dst|$dst, $src}", [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>, - XD, Requires<[UseSSE2, OptForSize]>, + TB, XD, Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC; } @@ -1321,14 +1321,14 @@ def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg, "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (v4f32 (X86frounds 
VR128:$src1, (v2f64 VR128:$src2))))]>, - XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>, + TB, XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>, Sched<[WriteCvtSD2SS]>; def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>, - XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>, + TB, XD, VEX, VVVV, VEX_LIG, WIG, Requires<[UseAVX]>, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; let Constraints = "$src1 = $dst" in { def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg, @@ -1336,13 +1336,13 @@ def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg, "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v4f32 (X86frounds VR128:$src1, (v2f64 VR128:$src2))))]>, - XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>; + TB, XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS]>; def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>, - XD, Requires<[UseSSE2]>, + TB, XD, Requires<[UseSSE2]>, Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>; } } @@ -1353,13 +1353,13 @@ let isCodeGenOnly = 1, hasSideEffects = 0, ExeDomain = SSEPackedSingle in { def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - XS, VEX, VVVV, VEX_LIG, WIG, + TB, XS, VEX, VVVV, VEX_LIG, WIG, Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC; let mayLoad = 1 in def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - XS, VEX, VVVV, VEX_LIG, WIG, + TB, XS, VEX, VVVV, VEX_LIG, WIG, Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, Requires<[UseAVX, OptForSize]>, SIMD_EXC; } // isCodeGenOnly = 1, hasSideEffects = 0 @@ -1373,11 +1373,11 @@ let isCodeGenOnly = 1, ExeDomain = SSEPackedSingle in { def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (any_fpextend FR32:$src))]>, - XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC; + TB, XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC; def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src), "cvtss2sd\t{$src, $dst|$dst, $src}", [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>, - XS, Requires<[UseSSE2, OptForSize]>, + TB, XS, Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>, SIMD_EXC; } // isCodeGenOnly = 1 @@ -1386,25 +1386,25 @@ let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1, def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, VEX, VVVV, VEX_LIG, WIG, + []>, TB, XS, VEX, VVVV, VEX_LIG, WIG, Requires<[HasAVX]>, Sched<[WriteCvtSS2SD]>; let mayLoad = 1 in def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, XS, VEX, VVVV, VEX_LIG, WIG, Requires<[HasAVX]>, + []>, TB, XS, VEX, VVVV, VEX_LIG, WIG, Requires<[HasAVX]>, Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>; let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), 
"cvtss2sd\t{$src2, $dst|$dst, $src2}", - []>, XS, Requires<[UseSSE2]>, + []>, TB, XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>; let mayLoad = 1 in def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "cvtss2sd\t{$src2, $dst|$dst, $src2}", - []>, XS, Requires<[UseSSE2]>, + []>, TB, XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>; } } // hasSideEffects = 0 @@ -1699,30 +1699,30 @@ let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>, - PS, VEX, Sched<[WriteCvtPS2PD]>, WIG; + TB, PS, VEX, Sched<[WriteCvtPS2PD]>, WIG; def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, - PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, WIG; + TB, PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, WIG; def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>, - PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, WIG; + TB, PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, WIG; def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>, - PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, WIG; + TB, PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, WIG; } let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in { def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>, - PS, Sched<[WriteCvtPS2PD]>; + TB, PS, Sched<[WriteCvtPS2PD]>; def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, - PS, Sched<[WriteCvtPS2PD.Folded]>; + TB, PS, Sched<[WriteCvtPS2PD.Folded]>; } // Convert Packed DW Integers to Packed Double FP @@ -1860,22 +1860,22 @@ let ExeDomain = SSEPackedSingle in defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32, "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, - XS, VEX, VVVV, VEX_LIG, WIG; + TB, XS, VEX, VVVV, VEX_LIG, WIG; let ExeDomain = SSEPackedDouble in defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64, "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, - XD, VEX, VVVV, VEX_LIG, WIG; + TB, XD, VEX, VVVV, VEX_LIG, WIG; let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedSingle in defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32, "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS; + SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, TB, XS; let ExeDomain = SSEPackedDouble in defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64, "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD; + SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, TB, XD; } // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS @@ -1919,44 +1919,44 @@ let mayLoad = 1 in let Defs = [EFLAGS] in { defm VUCOMISS : sse12_ord_cmp<0x2E, 
FR32, X86any_fcmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG; + "ucomiss", SSEPackedSingle>, TB, PS, VEX, VEX_LIG, WIG; defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG; + "ucomisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32, - "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG; + "comiss", SSEPackedSingle>, TB, PS, VEX, VEX_LIG, WIG; defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64, - "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG; + "comisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; let isCodeGenOnly = 1 in { defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG; + sse_load_f32, "ucomiss", SSEPackedSingle>, TB, PS, VEX, VEX_LIG, WIG; defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG; + sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, WIG; + sse_load_f32, "comiss", SSEPackedSingle>, TB, PS, VEX, VEX_LIG, WIG; defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, WIG; + sse_load_f64, "comisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; } defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, PS; + "ucomiss", SSEPackedSingle>, TB, PS; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64, - "ucomisd", SSEPackedDouble>, PD; + "ucomisd", SSEPackedDouble>, TB, PD; defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32, - "comiss", SSEPackedSingle>, PS; + "comiss", SSEPackedSingle>, TB, PS; defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64, - "comisd", SSEPackedDouble>, PD; + "comisd", SSEPackedDouble>, TB, PD; let isCodeGenOnly = 1 in { defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", SSEPackedSingle>, PS; + sse_load_f32, "ucomiss", SSEPackedSingle>, TB, PS; defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd", SSEPackedDouble>, PD; + sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD; defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", SSEPackedSingle>, PS; + sse_load_f32, "comiss", SSEPackedSingle>, TB, PS; defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd", SSEPackedDouble>, PD; + sse_load_f64, "comisd", SSEPackedDouble>, TB, PD; } } // Defs = [EFLAGS] @@ -1979,23 +1979,23 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, PS, VEX, VVVV, WIG; + SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, TB, PS, VEX, VVVV, WIG; defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, PD, VEX, VVVV, WIG; + SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, TB, PD, VEX, 
VVVV, WIG; defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32, "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, PS, VEX, VVVV, VEX_L, WIG; + SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, TB, PS, VEX, VVVV, VEX_L, WIG; defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64, "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, PD, VEX, VVVV, VEX_L, WIG; + SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, TB, PD, VEX, VVVV, VEX_L, WIG; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, PS; + SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, TB, PS; defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, PD; + SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, TB, PD; } def CommutableCMPCC : PatLeaf<(timm), [{ @@ -2076,27 +2076,27 @@ let Predicates = [HasAVX, NoVLX] in { defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, - PS, VEX, VVVV, WIG; + TB, PS, VEX, VVVV, WIG; defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>, - PS, VEX, VVVV, VEX_L, WIG; + TB, PS, VEX, VVVV, VEX_L, WIG; defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, - PD, VEX, VVVV, WIG; + TB, PD, VEX, VVVV, WIG; defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f64, SchedWriteFShuffle.YMM, SSEPackedDouble>, - PD, VEX, VVVV, VEX_L, WIG; + TB, PD, VEX, VVVV, VEX_L, WIG; } let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; + memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, PS; defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; + memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD; } //===----------------------------------------------------------------------===// @@ -2126,44 +2126,44 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, let Predicates = [HasAVX, NoVLX] in { defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX, VVVV, WIG; + SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, PS, VEX, VVVV, WIG; defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load, VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX, VVVV, WIG; + SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD, VEX, VVVV, WIG; defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load, VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, 
VEX, VVVV, WIG; + SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, PS, VEX, VVVV, WIG; defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load, VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX, VVVV, WIG; + SchedWriteFShuffle.XMM, SSEPackedDouble>, TB, PD, VEX, VVVV, WIG; defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load, VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX, VVVV, VEX_L, WIG; + SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, PS, VEX, VVVV, VEX_L, WIG; defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load, VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX, VVVV, VEX_L, WIG; + SchedWriteFShuffle.YMM, SSEPackedDouble>, TB, PD, VEX, VVVV, VEX_L, WIG; defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load, VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.YMM, SSEPackedSingle>, PS, VEX, VVVV, VEX_L, WIG; + SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, PS, VEX, VVVV, VEX_L, WIG; defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.YMM, SSEPackedDouble>, PD, VEX, VVVV, VEX_L, WIG; + SchedWriteFShuffle.YMM, SSEPackedDouble>, TB, PD, VEX, VVVV, VEX_L, WIG; }// Predicates = [HasAVX, NoVLX] let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop, VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", - SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; + SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, PS; defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop, VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", - SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD; + SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD; defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop, VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", - SchedWriteFShuffle.XMM, SSEPackedSingle>, PS; + SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, PS; defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop, VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", - SchedWriteFShuffle.XMM, SSEPackedDouble>, PD; + SchedWriteFShuffle.XMM, SSEPackedDouble>, TB, PD; } // Constraints = "$src1 = $dst" let Predicates = [HasAVX1Only] in { @@ -2208,13 +2208,13 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt, let Predicates = [HasAVX] in { defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", - SSEPackedSingle>, PS, VEX, WIG; + SSEPackedSingle>, TB, PS, VEX, WIG; defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", - SSEPackedDouble>, PD, VEX, WIG; + SSEPackedDouble>, TB, PD, VEX, WIG; defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps", - SSEPackedSingle>, PS, VEX, VEX_L, WIG; + SSEPackedSingle>, TB, PS, VEX, VEX_L, WIG; defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd", - SSEPackedDouble>, PD, VEX, VEX_L, WIG; + SSEPackedDouble>, TB, PD, VEX, VEX_L, WIG; // Also support integer VTs to avoid a int->fp bitcast in the DAG. 
def : Pat<(X86movmsk (v4i32 VR128:$src)), @@ -2228,9 +2228,9 @@ let Predicates = [HasAVX] in { } defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", - SSEPackedSingle>, PS; + SSEPackedSingle>, TB, PS; defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", - SSEPackedDouble>, PD; + SSEPackedDouble>, TB, PD; let Predicates = [UseSSE2] in { // Also support integer VTs to avoid a int->fp bitcast in the DAG. @@ -2312,29 +2312,29 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM, - [], [], 0>, PS, VEX, VVVV, VEX_L, WIG; + [], [], 0>, TB, PS, VEX, VVVV, VEX_L, WIG; defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM, - [], [], 0>, PD, VEX, VVVV, VEX_L, WIG; + [], [], 0>, TB, PD, VEX, VVVV, VEX_L, WIG; defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, - [], [], 0>, PS, VEX, VVVV, WIG; + [], [], 0>, TB, PS, VEX, VVVV, WIG; defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, - [], [], 0>, PD, VEX, VVVV, WIG; + [], [], 0>, TB, PD, VEX, VVVV, WIG; } let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, - [], []>, PS; + [], []>, TB, PS; defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, - [], []>, PD; + [], []>, TB, PD; } } @@ -2636,26 +2636,26 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in { let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, - SSEPackedSingle, sched.PS.XMM, 0>, PS, VEX, VVVV, WIG; + SSEPackedSingle, sched.PS.XMM, 0>, TB, PS, VEX, VVVV, WIG; defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, loadv2f64, - SSEPackedDouble, sched.PD.XMM, 0>, PD, VEX, VVVV, WIG; + SSEPackedDouble, sched.PD.XMM, 0>, TB, PD, VEX, VVVV, WIG; defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, v8f32, f256mem, loadv8f32, - SSEPackedSingle, sched.PS.YMM, 0>, PS, VEX, VVVV, VEX_L, WIG; + SSEPackedSingle, sched.PS.YMM, 0>, TB, PS, VEX, VVVV, VEX_L, WIG; defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, loadv4f64, - SSEPackedDouble, sched.PD.YMM, 0>, PD, VEX, VVVV, VEX_L, WIG; + SSEPackedDouble, sched.PD.YMM, 0>, TB, PD, VEX, VVVV, VEX_L, WIG; } let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, memopv4f32, SSEPackedSingle, - sched.PS.XMM>, PS; + sched.PS.XMM>, TB, PS; defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, memopv2f64, SSEPackedDouble, - sched.PD.XMM>, PD; + sched.PD.XMM>, TB, PD; } } } @@ -2665,18 +2665,18 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDPatternOperat let Uses = [MXCSR], mayRaiseFPException = 1 in { defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>, - XS, VEX, VVVV, VEX_LIG, WIG; + TB, XS, VEX, VVVV, VEX_LIG, WIG; defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), OpNode, FR64, 
f64mem, SSEPackedDouble, sched.PD.Scl, 0>, - XD, VEX, VVVV, VEX_LIG, WIG; + TB, XD, VEX, VVVV, VEX_LIG, WIG; let Constraints = "$src1 = $dst" in { defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"), OpNode, FR32, f32mem, SSEPackedSingle, - sched.PS.Scl>, XS; + sched.PS.Scl>, TB, XS; defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"), OpNode, FR64, f64mem, SSEPackedDouble, - sched.PD.Scl>, XD; + sched.PD.Scl>, TB, XD; } } } @@ -2687,18 +2687,18 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr, let Uses = [MXCSR], mayRaiseFPException = 1 in { defm V#NAME#SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32, !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, - SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX, VVVV, VEX_LIG, WIG; + SSEPackedSingle, sched.PS.Scl, 0>, TB, XS, VEX, VVVV, VEX_LIG, WIG; defm V#NAME#SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64, !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, - SSEPackedDouble, sched.PD.Scl, 0>, XD, VEX, VVVV, VEX_LIG, WIG; + SSEPackedDouble, sched.PD.Scl, 0>, TB, XD, VEX, VVVV, VEX_LIG, WIG; let Constraints = "$src1 = $dst" in { defm SS : sse12_fp_scalar_int<opc, OpNode, VR128, v4f32, !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32, - SSEPackedSingle, sched.PS.Scl>, XS; + SSEPackedSingle, sched.PS.Scl>, TB, XS; defm SD : sse12_fp_scalar_int<opc, OpNode, VR128, v2f64, !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64, - SSEPackedDouble, sched.PD.Scl>, XD; + SSEPackedDouble, sched.PD.Scl>, TB, XD; } } } @@ -3016,29 +3016,29 @@ let Predicates = [HasAVX, NoVLX] in { multiclass sse1_fp_unop_s_intr<string OpcodeStr, Predicate AVXTarget> { defm SS : sse_fp_unop_s_intr<v4f32, sse_load_f32, !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), - UseSSE1>, XS; + UseSSE1>, TB, XS; defm V#NAME#SS : avx_fp_unop_s_intr<v4f32, sse_load_f32, !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss), AVXTarget>, - XS, VEX, VVVV, VEX_LIG, WIG; + TB, XS, VEX, VVVV, VEX_LIG, WIG; } multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, X86SchedWriteWidths sched, Predicate AVXTarget> { defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32mem, - ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS; + ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, TB, XS; defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32, f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>, - XS, VEX, VVVV, VEX_LIG, WIG; + TB, XS, VEX, VVVV, VEX_LIG, WIG; } multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, X86SchedWriteWidths sched, Predicate AVXTarget> { defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64mem, - sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD; + sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, TB, XD; defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64, f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>, - XD, VEX, VVVV, VEX_LIG, WIG; + TB, XD, VEX, VVVV, VEX_LIG, WIG; } // Square root. 
@@ -3165,11 +3165,11 @@ let SchedRW = [WriteStoreNT] in { def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movnti{l}\t{$src, $dst|$dst, $src}", [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, - PS, Requires<[HasSSE2]>; + TB, PS, Requires<[HasSSE2]>; def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movnti{q}\t{$src, $dst|$dst, $src}", [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, - PS, Requires<[HasSSE2]>; + TB, PS, Requires<[HasSSE2]>; } // SchedRW = [WriteStoreNT] let Predicates = [HasAVX, NoVLX] in { @@ -3226,14 +3226,14 @@ let SchedRW = [WriteLoad] in { // Flush cache def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, - PS, Requires<[HasCLFLUSH]>; + TB, PS, Requires<[HasCLFLUSH]>; } let SchedRW = [WriteNop] in { // Pause. This "instruction" is encoded as "rep; nop", so even though it // was introduced with SSE2, it's backward compatible. def PAUSE : I<0x90, RawFrm, (outs), (ins), - "pause", [(int_x86_sse2_pause)]>, OBXS; + "pause", [(int_x86_sse2_pause)]>, XS; } let SchedRW = [WriteFence] in { @@ -3241,11 +3241,11 @@ let SchedRW = [WriteFence] in { // TODO: As with mfence, we may want to ease the availability of sfence/lfence // to include any 64-bit target. def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, - PS, Requires<[HasSSE1]>; + TB, PS, Requires<[HasSSE1]>; def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>, - PS, Requires<[HasSSE2]>; + TB, PS, Requires<[HasSSE2]>; def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, - PS, Requires<[HasMFence]>; + TB, PS, Requires<[HasMFence]>; } // SchedRW def : Pat<(X86MFence), (MFENCE)>; @@ -3266,11 +3266,11 @@ def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, - PS, Sched<[WriteLDMXCSR]>; + TB, PS, Sched<[WriteLDMXCSR]>; let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, - PS, Sched<[WriteSTMXCSR]>; + TB, PS, Sched<[WriteSTMXCSR]>; //===---------------------------------------------------------------------===// // SSE2 - Move Aligned/Unaligned Packed Integer Instructions @@ -3327,11 +3327,11 @@ def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (loadv2i64 addr:$src))]>, Sched<[SchedWriteVecMoveLS.XMM.RM]>, - XS, VEX, WIG; + TB, XS, VEX, WIG; def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), "vmovdqu\t{$src, $dst|$dst, $src}", []>, Sched<[SchedWriteVecMoveLS.YMM.RM]>, - XS, VEX, VEX_L, WIG; + TB, XS, VEX, VEX_L, WIG; } let mayStore = 1, hasSideEffects = 0, Predicates = [HasAVX,NoVLX] in { @@ -3347,10 +3347,10 @@ def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs), def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "vmovdqu\t{$src, $dst|$dst, $src}", [(store (v2i64 VR128:$src), addr:$dst)]>, - Sched<[SchedWriteVecMoveLS.XMM.MR]>, XS, VEX, WIG; + Sched<[SchedWriteVecMoveLS.XMM.MR]>, TB, XS, VEX, WIG; def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src), "vmovdqu\t{$src, $dst|$dst, $src}",[]>, - Sched<[SchedWriteVecMoveLS.YMM.MR]>, XS, VEX, VEX_L, WIG; + Sched<[SchedWriteVecMoveLS.YMM.MR]>, TB, XS, VEX, VEX_L, WIG; } let SchedRW = 
[SchedWriteVecMoveLS.XMM.RR] in { @@ -3360,7 +3360,7 @@ def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", []>, - XS, Requires<[UseSSE2]>; + TB, XS, Requires<[UseSSE2]>; } // For Disassembler @@ -3370,7 +3370,7 @@ def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", []>, - XS, Requires<[UseSSE2]>; + TB, XS, Requires<[UseSSE2]>; } } // SchedRW @@ -3382,7 +3382,7 @@ def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), "movdqu\t{$src, $dst|$dst, $src}", [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>, - XS, Requires<[UseSSE2]>; + TB, XS, Requires<[UseSSE2]>; } let mayStore = 1, hasSideEffects = 0, @@ -3393,7 +3393,7 @@ def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [/*(store (v2i64 VR128:$src), addr:$dst)*/]>, - XS, Requires<[UseSSE2]>; + TB, XS, Requires<[UseSSE2]>; } } // ExeDomain = SSEPackedInt @@ -3757,11 +3757,11 @@ let Predicates = [UseSSE2] in { } // ExeDomain = SSEPackedInt defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, - SchedWriteShuffle, NoVLX>, PD; + SchedWriteShuffle, NoVLX>, TB, PD; defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, - SchedWriteShuffle, NoVLX_Or_NoBWI>, XS; + SchedWriteShuffle, NoVLX_Or_NoBWI>, TB, XS; defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, - SchedWriteShuffle, NoVLX_Or_NoBWI>, XD; + SchedWriteShuffle, NoVLX_Or_NoBWI>, TB, XD; //===---------------------------------------------------------------------===// // Packed Integer Pack Instructions (SSE & AVX) @@ -4004,7 +4004,7 @@ def VPEXTRWrr : Ii8<0xC5, MRMSrcReg, "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1), timm:$src2))]>, - PD, VEX, WIG, Sched<[WriteVecExtract]>; + TB, PD, VEX, WIG, Sched<[WriteVecExtract]>; def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -4014,10 +4014,10 @@ def PEXTRWrr : PDIi8<0xC5, MRMSrcReg, // Insert let Predicates = [HasAVX, NoBWI] in -defm VPINSRW : sse2_pinsrw<0>, PD, VEX, VVVV, WIG; +defm VPINSRW : sse2_pinsrw<0>, TB, PD, VEX, VVVV, WIG; let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in -defm PINSRW : sse2_pinsrw, PD; +defm PINSRW : sse2_pinsrw, TB, PD; } // ExeDomain = SSEPackedInt @@ -4306,13 +4306,13 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLoad] in { def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS, + (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, TB, XS, VEX, Requires<[UseAVX]>, WIG; def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, - XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix + TB, XS, Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix } // ExeDomain, SchedRW //===---------------------------------------------------------------------===// @@ -4369,11 +4369,11 @@ let 
ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in { def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, - XS, VEX, Requires<[UseAVX]>, WIG; + TB, XS, VEX, Requires<[UseAVX]>, WIG; def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>, - XS, Requires<[UseSSE2]>; + TB, XS, Requires<[UseSSE2]>; } // ExeDomain, SchedRW let Predicates = [UseAVX] in { @@ -4563,27 +4563,27 @@ let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem, SchedWriteFAddSizes.PS.XMM, loadv4f32, 0>, - XD, VEX, VVVV, WIG; + TB, XD, VEX, VVVV, WIG; defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem, SchedWriteFAddSizes.PS.YMM, loadv8f32, 0>, - XD, VEX, VVVV, VEX_L, WIG; + TB, XD, VEX, VVVV, VEX_L, WIG; } let ExeDomain = SSEPackedDouble in { defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem, SchedWriteFAddSizes.PD.XMM, loadv2f64, 0>, - PD, VEX, VVVV, WIG; + TB, PD, VEX, VVVV, WIG; defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem, SchedWriteFAddSizes.PD.YMM, loadv4f64, 0>, - PD, VEX, VVVV, VEX_L, WIG; + TB, PD, VEX, VVVV, VEX_L, WIG; } } let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { let ExeDomain = SSEPackedSingle in defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem, - SchedWriteFAddSizes.PS.XMM, memopv4f32>, XD; + SchedWriteFAddSizes.PS.XMM, memopv4f32>, TB, XD; let ExeDomain = SSEPackedDouble in defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem, - SchedWriteFAddSizes.PD.XMM, memopv2f64>, PD; + SchedWriteFAddSizes.PD.XMM, memopv2f64>, TB, PD; } //===---------------------------------------------------------------------===// @@ -5760,33 +5760,33 @@ let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "popcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>, - Sched<[WritePOPCNT]>, OpSize16, XS; + Sched<[WritePOPCNT]>, OpSize16, TB, XS; def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "popcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (ctpop (loadi16 addr:$src))), (implicit EFLAGS)]>, - Sched<[WritePOPCNT.Folded]>, OpSize16, XS; + Sched<[WritePOPCNT.Folded]>, OpSize16, TB, XS; def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "popcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>, - Sched<[WritePOPCNT]>, OpSize32, XS; + Sched<[WritePOPCNT]>, OpSize32, TB, XS; def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "popcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (ctpop (loadi32 addr:$src))), (implicit EFLAGS)]>, - Sched<[WritePOPCNT.Folded]>, OpSize32, XS; + Sched<[WritePOPCNT.Folded]>, OpSize32, TB, XS; def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "popcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>, - Sched<[WritePOPCNT]>, XS; + Sched<[WritePOPCNT]>, TB, XS; def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "popcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (ctpop (loadi64 addr:$src))), (implicit EFLAGS)]>, - Sched<[WritePOPCNT.Folded]>, XS; + Sched<[WritePOPCNT.Folded]>, TB, XS; } // SS41I_unop_rm_int_v16 - 
SSE 4.1 unary operator whose type is v8i16. @@ -6290,7 +6290,7 @@ multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))], - SSEPackedInt>, TAPD, VEX, VVVV, + SSEPackedInt>, TA, PD, VEX, VVVV, Sched<[sched]>; def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst), @@ -6299,7 +6299,7 @@ multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (OpNode RC:$src3, (mem_frag addr:$src2), - RC:$src1))], SSEPackedInt>, TAPD, VEX, VVVV, + RC:$src1))], SSEPackedInt>, TA, PD, VEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold, // x86memop:$src2 ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -6715,7 +6715,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, [!if(UsesXMM0, (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, - T8PS, Sched<[sched]>; + T8, PS, Sched<[sched]>; def rm#Suffix : I<Opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), @@ -6726,7 +6726,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, (set VR128:$dst, (IntId VR128:$src1, (memop addr:$src2), XMM0)), (set VR128:$dst, (IntId VR128:$src1, - (memop addr:$src2))))]>, T8PS, + (memop addr:$src2))))]>, T8, PS, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6736,7 +6736,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, NoEGPR] in { "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, - (i8 timm:$src3)))]>, TAPS, + (i8 timm:$src3)))]>, TA, PS, Sched<[SchedWriteVecIMul.XMM]>; def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), @@ -6744,7 +6744,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, NoEGPR] in { [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, (memop addr:$src2), - (i8 timm:$src3)))]>, TAPS, + (i8 timm:$src3)))]>, TA, PS, Sched<[SchedWriteVecIMul.XMM.Folded, SchedWriteVecIMul.XMM.ReadAfterFold]>; @@ -6772,7 +6772,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, HasEGPR, In64BitMode] in [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, (i8 timm:$src3)))]>, - EVEX, NoCD8, T_MAP4PS, Sched<[SchedWriteVecIMul.XMM]>; + EVEX, NoCD8, T_MAP4, PS, Sched<[SchedWriteVecIMul.XMM]>; def SHA1RNDS4rmi_EVEX: Ii8<0xD4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", @@ -6780,31 +6780,31 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, HasEGPR, In64BitMode] in (int_x86_sha1rnds4 VR128:$src1, (memop addr:$src2), (i8 timm:$src3)))]>, - EVEX, NoCD8, T_MAP4PS, + EVEX, NoCD8, T_MAP4, PS, Sched<[SchedWriteVecIMul.XMM.Folded, SchedWriteVecIMul.XMM.ReadAfterFold]>; defm SHA1NEXTE : SHAI_binop<0xD8, "sha1nexte", int_x86_sha1nexte, SchedWriteVecIMul.XMM, "_EVEX">, - EVEX, NoCD8, T_MAP4PS; + EVEX, NoCD8, T_MAP4; defm SHA1MSG1 : SHAI_binop<0xD9, "sha1msg1", int_x86_sha1msg1, SchedWriteVecIMul.XMM, "_EVEX">, - EVEX, NoCD8, T_MAP4PS; + EVEX, NoCD8, T_MAP4; defm SHA1MSG2 : SHAI_binop<0xDA, "sha1msg2", int_x86_sha1msg2, SchedWriteVecIMul.XMM, "_EVEX">, - EVEX, NoCD8, T_MAP4PS; + EVEX, NoCD8, T_MAP4; let Uses=[XMM0] in defm SHA256RNDS2 : SHAI_binop<0xDB, "sha256rnds2", int_x86_sha256rnds2, SchedWriteVecIMul.XMM, "_EVEX", 1>, 
- EVEX, NoCD8, T_MAP4PS; + EVEX, NoCD8, T_MAP4; defm SHA256MSG1 : SHAI_binop<0xDC, "sha256msg1", int_x86_sha256msg1, SchedWriteVecIMul.XMM, "_EVEX">, - EVEX, NoCD8, T_MAP4PS; + EVEX, NoCD8, T_MAP4; defm SHA256MSG2 : SHAI_binop<0xDD, "sha256msg2", int_x86_sha256msg2, SchedWriteVecIMul.XMM, "_EVEX">, - EVEX, NoCD8, T_MAP4PS; + EVEX, NoCD8, T_MAP4; } //===----------------------------------------------------------------------===// @@ -7035,26 +7035,26 @@ def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), "extrq\t{$idx, $len, $src|$src, $len, $idx}", [(set VR128:$dst, (X86extrqi VR128:$src, timm:$len, timm:$idx))]>, - PD, Sched<[SchedWriteVecALU.XMM]>; + TB, PD, Sched<[SchedWriteVecALU.XMM]>; def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), "extrq\t{$mask, $src|$src, $mask}", [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, VR128:$mask))]>, - PD, Sched<[SchedWriteVecALU.XMM]>; + TB, PD, Sched<[SchedWriteVecALU.XMM]>; def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, timm:$len, timm:$idx))]>, - XD, Sched<[SchedWriteVecALU.XMM]>; + TB, XD, Sched<[SchedWriteVecALU.XMM]>; def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), "insertq\t{$mask, $src|$src, $mask}", [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, VR128:$mask))]>, - XD, Sched<[SchedWriteVecALU.XMM]>; + TB, XD, Sched<[SchedWriteVecALU.XMM]>; } } // ExeDomain = SSEPackedInt @@ -7062,10 +7062,10 @@ def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), let AddedComplexity = 400 in { // Prefer non-temporal versions let hasSideEffects = 0, mayStore = 1, SchedRW = [SchedWriteFMoveLSNT.Scl.MR] in { def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), - "movntss\t{$src, $dst|$dst, $src}", []>, XS; + "movntss\t{$src, $dst|$dst, $src}", []>, TB, XS; def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), - "movntsd\t{$src, $dst|$dst, $src}", []>, XD; + "movntsd\t{$src, $dst|$dst, $src}", []>, TB, XD; } // SchedRW def : Pat<(nontemporalstore FR32:$src, addr:$dst), @@ -7474,12 +7474,12 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { // Zero All YMM registers def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", - [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, + [(int_x86_avx_vzeroall)]>, TB, PS, VEX, VEX_L, Requires<[HasAVX]>, WIG; // Zero Upper bits of YMM registers def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", - [(int_x86_avx_vzeroupper)]>, PS, VEX, + [(int_x86_avx_vzeroupper)]>, TB, PS, VEX, Requires<[HasAVX]>, WIG; } // Defs } // SchedRW @@ -7493,11 +7493,11 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>, - T8PD, VEX, Sched<[sched]>; + T8, PD, VEX, Sched<[sched]>; let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - []>, T8PD, VEX, Sched<[sched.Folded]>; + []>, T8, PD, VEX, Sched<[sched.Folded]>; } multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, @@ -7506,12 +7506,12 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, (ins RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, 
$src2}", [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>, - TAPD, VEX, Sched<[RR]>; + TA, PD, VEX, Sched<[RR]>; let hasSideEffects = 0, mayStore = 1 in def mr : Ii8<0x1D, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - TAPD, VEX, Sched<[MR]>; + TA, PD, VEX, Sched<[MR]>; } let Predicates = [HasF16C, NoVLX] in { @@ -8109,12 +8109,12 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT, let isCommutable = 1 in def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "", [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>, - Sched<[sched]>, T8PD; + Sched<[sched]>, T8; def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "", [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, (MemOpFrag addr:$src2))))]>, - Sched<[sched.Folded, sched.ReadAfterFold]>, T8PD; + Sched<[sched.Folded, sched.ReadAfterFold]>, T8; } } @@ -8167,9 +8167,9 @@ let Predicates = [HasGFNI, HasAVX, NoVLX] in { // GF2P8AFFINEINVQB, GF2P8AFFINEQB let isCommutable = 0 in { defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb", - X86GF2P8affineinvqb>, TAPD; + X86GF2P8affineinvqb>, TA, PD; defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb", - X86GF2P8affineqb>, TAPD; + X86GF2P8affineqb>, TA, PD; } // AVX-IFMA @@ -8234,40 +8234,40 @@ multiclass avx_dotprod_rm<bits<8> Opc, string OpcodeStr, ValueType OpVT, let Predicates = [HasAVXVNNIINT8] in { defm VPDPBSSD : avx_dotprod_rm<0x50,"vpdpbssd", v4i32, VR128, loadv4i32, i128mem, X86vpdpbssd, SchedWriteVecIMul.XMM, - 1>, T8XD; + 1>, T8, XD; defm VPDPBSSDY : avx_dotprod_rm<0x50,"vpdpbssd", v8i32, VR256, loadv8i32, i256mem, X86vpdpbssd, SchedWriteVecIMul.YMM, - 1>, VEX_L, T8XD; + 1>, VEX_L, T8, XD; defm VPDPBUUD : avx_dotprod_rm<0x50,"vpdpbuud", v4i32, VR128, loadv4i32, i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM, - 1>, T8PS; + 1>, T8, PS; defm VPDPBUUDY : avx_dotprod_rm<0x50,"vpdpbuud", v8i32, VR256, loadv8i32, i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM, - 1>, VEX_L, T8PS; + 1>, VEX_L, T8, PS; defm VPDPBSSDS : avx_dotprod_rm<0x51,"vpdpbssds", v4i32, VR128, loadv4i32, i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM, - 1>, T8XD; + 1>, T8, XD; defm VPDPBSSDSY : avx_dotprod_rm<0x51,"vpdpbssds", v8i32, VR256, loadv8i32, i256mem, X86vpdpbssds, SchedWriteVecIMul.YMM, - 1>, VEX_L, T8XD; + 1>, VEX_L, T8, XD; defm VPDPBUUDS : avx_dotprod_rm<0x51,"vpdpbuuds", v4i32, VR128, loadv4i32, i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM, - 1>, T8PS; + 1>, T8, PS; defm VPDPBUUDSY : avx_dotprod_rm<0x51,"vpdpbuuds", v8i32, VR256, loadv8i32, i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM, - 1>, VEX_L, T8PS; + 1>, VEX_L, T8, PS; defm VPDPBSUD : avx_dotprod_rm<0x50,"vpdpbsud", v4i32, VR128, loadv4i32, i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM, - 0>, T8XS; + 0>, T8, XS; defm VPDPBSUDY : avx_dotprod_rm<0x50,"vpdpbsud", v8i32, VR256, loadv8i32, i256mem, X86vpdpbsud, SchedWriteVecIMul.YMM, - 0>, VEX_L, T8XS; + 0>, VEX_L, T8, XS; defm VPDPBSUDS : avx_dotprod_rm<0x51,"vpdpbsuds", v4i32, VR128, loadv4i32, i128mem, X86vpdpbsuds, SchedWriteVecIMul.XMM, - 0>, T8XS; + 0>, T8, XS; defm VPDPBSUDSY : avx_dotprod_rm<0x51,"vpdpbsuds", v8i32, VR256, loadv8i32, i256mem, X86vpdpbsuds, SchedWriteVecIMul.YMM, - 0>, VEX_L, T8XS; + 0>, VEX_L, T8, XS; } // AVX-NE-CONVERT @@ -8306,18 +8306,18 @@ multiclass VCVTNEPS2BF16_BASE { let Predicates = [HasAVXNECONVERT] in { defm VBCSTNEBF162PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnebf162ps", f16mem, - f16mem>, T8XS; + f16mem>, T8, XS; defm 
VBCSTNESH2PS : AVX_NE_CONVERT_BASE<0xb1, "vbcstnesh2ps", f16mem, f16mem>, - T8PD; + T8, PD; defm VCVTNEEBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneebf162ps", f128mem, - f256mem>, T8XS; + f256mem>, T8, XS; defm VCVTNEEPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneeph2ps", f128mem, - f256mem>, T8PD; + f256mem>, T8, PD; defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps", f128mem, - f256mem>, T8XD; + f256mem>, T8, XD; defm VCVTNEOPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps", f128mem, - f256mem>, T8PS; - defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8XS, ExplicitVEXPrefix; + f256mem>, T8, PS; + defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8, XS, ExplicitVEXPrefix; def : Pat<(v8bf16 (X86vfpround (v8f32 VR256:$src))), (VCVTNEPS2BF16Yrr VR256:$src)>; @@ -8337,19 +8337,19 @@ def VSHA512MSG1rr : I<0xcc, MRMSrcReg, (outs VR256:$dst), "vsha512msg1\t{$src2, $dst|$dst, $src2}", [(set VR256:$dst, (int_x86_vsha512msg1 VR256:$src1, VR128:$src2))]>, VEX_L, - VEX, T8XD, Sched<[WriteVecIMul]>; + VEX, T8, XD, Sched<[WriteVecIMul]>; def VSHA512MSG2rr : I<0xcd, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), "vsha512msg2\t{$src2, $dst|$dst, $src2}", [(set VR256:$dst, (int_x86_vsha512msg2 VR256:$src1, VR256:$src2))]>, VEX_L, - VEX, T8XD, Sched<[WriteVecIMul]>; + VEX, T8, XD, Sched<[WriteVecIMul]>; def VSHA512RNDS2rr : I<0xcb, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR128:$src3), "vsha512rnds2\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR256:$dst, (int_x86_vsha512rnds2 VR256:$src1, VR256:$src2, VR128:$src3))]>, - VEX_L, VEX, VVVV, T8XD, Sched<[WriteVecIMul]>; + VEX_L, VEX, VVVV, T8, XD, Sched<[WriteVecIMul]>; } // FIXME: Is there a better scheduler class for SM3 than WriteVecIMul? @@ -8389,9 +8389,9 @@ let Predicates = [HasSM3], Constraints = "$src1 = $dst" in { } } -defm VSM3MSG1 : SM3_Base<"vsm3msg1">, T8PS; -defm VSM3MSG2 : SM3_Base<"vsm3msg2">, T8PD; -defm VSM3RNDS2 : VSM3RNDS2_Base, VEX, VVVV, TAPD; +defm VSM3MSG1 : SM3_Base<"vsm3msg1">, T8, PS; +defm VSM3MSG2 : SM3_Base<"vsm3msg2">, T8, PD; +defm VSM3RNDS2 : VSM3RNDS2_Base, VEX, VVVV, TA, PD; // FIXME: Is there a better scheduler class for SM4 than WriteVecIMul? 
let Predicates = [HasSM4] in { @@ -8412,10 +8412,10 @@ let Predicates = [HasSM4] in { } } -defm VSM4KEY4 : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8XS, VEX, VVVV; -defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8XS, VEX_L, VEX, VVVV; -defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8XD, VEX, VVVV; -defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8XD, VEX_L, VEX, VVVV; +defm VSM4KEY4 : SM4_Base<"vsm4key4", VR128, "128", loadv4i32, i128mem>, T8, XS, VEX, VVVV; +defm VSM4KEY4Y : SM4_Base<"vsm4key4", VR256, "256", loadv8i32, i256mem>, T8, XS, VEX_L, VEX, VVVV; +defm VSM4RNDS4 : SM4_Base<"vsm4rnds4", VR128, "128", loadv4i32, i128mem>, T8, XD, VEX, VVVV; +defm VSM4RNDS4Y : SM4_Base<"vsm4rnds4", VR256, "256", loadv8i32, i256mem>, T8, XD, VEX_L, VEX, VVVV; let Predicates = [HasAVXVNNIINT16], Constraints = "$src1 = $dst" in multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> { @@ -8454,9 +8454,9 @@ multiclass avx_vnni_int16<bits<8> opc, string OpcodeStr, bit IsCommutable> { VEX, VVVV, VEX_L, Sched<[SchedWriteVecIMul.YMM]>; } -defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8XS; -defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8XS; -defm VPDPWUSD : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8PD; -defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8PD; -defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8PS; -defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8PS; +defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8, XS; +defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8, XS; +defm VPDPWUSD : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8, PD; +defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8, PD; +defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8, PS; +defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8, PS; diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td index 48bf23f8cbf7b..d13e3b7af69a9 100644 --- a/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -829,12 +829,12 @@ multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop, let hasSideEffects = 0 in { def ri#Suffix : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - TAXD, VEX, Sched<[WriteShift]>; + TA, XD, VEX, Sched<[WriteShift]>; let mayLoad = 1 in def mi#Suffix : Ii8<0xF0, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, u8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - TAXD, VEX, Sched<[WriteShiftLd]>; + TA, XD, VEX, Sched<[WriteShiftLd]>; } } @@ -860,23 +860,23 @@ let hasSideEffects = 0 in { let Predicates = [HasBMI2, NoEGPR] in { defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem>; defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem>, REX_W; - defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8XS; - defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8XS, REX_W; - defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8XD; - defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, REX_W; - defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8PD; - defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8PD, REX_W; + defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem>, T8, XS; + defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem>, T8, XS, REX_W; + defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem>, T8, XD; + defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8, XD, 
REX_W; + defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8, PD; + defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, PD, REX_W; } let Predicates = [HasBMI2, HasEGPR] in { defm RORX32 : bmi_rotate<"rorx{l}", GR32, i32mem, "_EVEX">, EVEX; defm RORX64 : bmi_rotate<"rorx{q}", GR64, i64mem, "_EVEX">, REX_W, EVEX; - defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem, "_EVEX">, T8XS, EVEX; - defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem, "_EVEX">, T8XS, REX_W, EVEX; - defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem, "_EVEX">, T8XD, EVEX; - defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem, "_EVEX">, T8XD, REX_W, EVEX; - defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem, "_EVEX">, T8PD, EVEX; - defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem, "_EVEX">, T8PD, REX_W, EVEX; + defm SARX32 : bmi_shift<"sarx{l}", GR32, i32mem, "_EVEX">, T8, XS, EVEX; + defm SARX64 : bmi_shift<"sarx{q}", GR64, i64mem, "_EVEX">, T8, XS, REX_W, EVEX; + defm SHRX32 : bmi_shift<"shrx{l}", GR32, i32mem, "_EVEX">, T8, XD, EVEX; + defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem, "_EVEX">, T8, XD, REX_W, EVEX; + defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem, "_EVEX">, T8, PD, EVEX; + defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem, "_EVEX">, T8, PD, REX_W, EVEX; } let Predicates = [HasBMI2] in { diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index 25db96b31be7a..4471071e8f9a9 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -426,31 +426,31 @@ let SchedRW = [WriteSystem] in { let Uses = [EAX, ECX, EDX] in def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB; let Uses = [EAX, ECX, EDX] in -def WRMSRNS : I<0x01, MRM_C6, (outs), (ins), "wrmsrns", []>, PS; +def WRMSRNS : I<0x01, MRM_C6, (outs), (ins), "wrmsrns", []>, TB, PS; let Defs = [EAX, EDX], Uses = [ECX] in def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB; let Defs = [RAX, EFLAGS], Uses = [RBX, RCX], Predicates = [In64BitMode] in -def PBNDKB : I<0x01, MRM_C7, (outs), (ins), "pbndkb", []>, PS; +def PBNDKB : I<0x01, MRM_C7, (outs), (ins), "pbndkb", []>, TB, PS; let Uses = [RSI, RDI, RCX], Predicates = [In64BitMode] in { -def WRMSRLIST : I<0x01, MRM_C6, (outs), (ins), "wrmsrlist", []>, XS; -def RDMSRLIST : I<0x01, MRM_C6, (outs), (ins), "rdmsrlist", []>, XD; +def WRMSRLIST : I<0x01, MRM_C6, (outs), (ins), "wrmsrlist", []>, TB, XS; +def RDMSRLIST : I<0x01, MRM_C6, (outs), (ins), "rdmsrlist", []>, TB, XD; } let Predicates = [HasUSERMSR], mayLoad = 1 in { def URDMSRrr : I<0xf8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "urdmsr\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (int_x86_urdmsr GR64:$src))]>, T8XD; + [(set GR64:$dst, (int_x86_urdmsr GR64:$src))]>, T8, XD; def URDMSRri : Ii32<0xf8, MRM0r, (outs GR64:$dst), (ins i64i32imm:$imm), "urdmsr\t{$imm, $dst|$dst, $imm}", - [(set GR64:$dst, (int_x86_urdmsr i64immSExt32_su:$imm))]>, T_MAP7XD, VEX; + [(set GR64:$dst, (int_x86_urdmsr i64immSExt32_su:$imm))]>, T_MAP7, XD, VEX; } let Predicates = [HasUSERMSR], mayStore = 1 in { def UWRMSRrr : I<0xf8, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2), "uwrmsr\t{$src1, $src2|$src2, $src1}", - [(int_x86_uwrmsr GR64:$src1, GR64:$src2)]>, T8XS; + [(int_x86_uwrmsr GR64:$src1, GR64:$src2)]>, T8, XS; def UWRMSRir : Ii32<0xf8, MRM0r, (outs), (ins GR64:$src, i64i32imm:$imm), "uwrmsr\t{$src, $imm|$imm, $src}", - [(int_x86_uwrmsr GR64:$src, i64immSExt32_su:$imm)]>, T_MAP7XS, VEX; + [(int_x86_uwrmsr GR64:$src, i64immSExt32_su:$imm)]>, T_MAP7, XS, VEX; } let Defs = [RAX, RDX], Uses = [ECX] 
in def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB; @@ -481,12 +481,12 @@ let Defs = [EAX, EBX, ECX, EDX], Uses = [EAX, ECX] in // Cache instructions let SchedRW = [WriteSystem] in { def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB; -def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, PS; +def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [(int_x86_wbinvd)]>, TB, PS; // wbnoinvd is like wbinvd, except without invalidation // encoding: like wbinvd + an 0xF3 prefix def WBNOINVD : I<0x09, RawFrm, (outs), (ins), "wbnoinvd", - [(int_x86_wbnoinvd)]>, XS, + [(int_x86_wbnoinvd)]>, TB, XS, Requires<[HasWBNOINVD]>; } // SchedRW @@ -497,74 +497,74 @@ let SchedRW = [WriteSystem] in { let Uses = [SSP] in { let Defs = [SSP] in { def INCSSPD : I<0xAE, MRM5r, (outs), (ins GR32:$src), "incsspd\t$src", - [(int_x86_incsspd GR32:$src)]>, XS; + [(int_x86_incsspd GR32:$src)]>, TB, XS; def INCSSPQ : RI<0xAE, MRM5r, (outs), (ins GR64:$src), "incsspq\t$src", - [(int_x86_incsspq GR64:$src)]>, XS; + [(int_x86_incsspq GR64:$src)]>, TB, XS; } // Defs SSP let Constraints = "$src = $dst" in { def RDSSPD : I<0x1E, MRM1r, (outs GR32:$dst), (ins GR32:$src), "rdsspd\t$dst", - [(set GR32:$dst, (int_x86_rdsspd GR32:$src))]>, XS; + [(set GR32:$dst, (int_x86_rdsspd GR32:$src))]>, TB, XS; def RDSSPQ : RI<0x1E, MRM1r, (outs GR64:$dst), (ins GR64:$src), "rdsspq\t$dst", - [(set GR64:$dst, (int_x86_rdsspq GR64:$src))]>, XS; + [(set GR64:$dst, (int_x86_rdsspq GR64:$src))]>, TB, XS; } let Defs = [SSP] in { def SAVEPREVSSP : I<0x01, MRM_EA, (outs), (ins), "saveprevssp", - [(int_x86_saveprevssp)]>, XS; + [(int_x86_saveprevssp)]>, TB, XS; def RSTORSSP : I<0x01, MRM5m, (outs), (ins i32mem:$src), "rstorssp\t$src", - [(int_x86_rstorssp addr:$src)]>, XS; + [(int_x86_rstorssp addr:$src)]>, TB, XS; } // Defs SSP } // Uses SSP let Predicates = [NoEGPR] in { def WRSSD : I<0xF6, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "wrssd\t{$src, $dst|$dst, $src}", - [(int_x86_wrssd GR32:$src, addr:$dst)]>, T8PS; + [(int_x86_wrssd GR32:$src, addr:$dst)]>, T8, PS; def WRSSQ : RI<0xF6, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "wrssq\t{$src, $dst|$dst, $src}", - [(int_x86_wrssq GR64:$src, addr:$dst)]>, T8PS; + [(int_x86_wrssq GR64:$src, addr:$dst)]>, T8, PS; def WRUSSD : I<0xF5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "wrussd\t{$src, $dst|$dst, $src}", - [(int_x86_wrussd GR32:$src, addr:$dst)]>, T8PD; + [(int_x86_wrussd GR32:$src, addr:$dst)]>, T8, PD; def WRUSSQ : RI<0xF5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "wrussq\t{$src, $dst|$dst, $src}", - [(int_x86_wrussq GR64:$src, addr:$dst)]>, T8PD; + [(int_x86_wrussq GR64:$src, addr:$dst)]>, T8, PD; } let Predicates = [HasEGPR, In64BitMode] in { def WRSSD_EVEX : I<0x66, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "wrssd\t{$src, $dst|$dst, $src}", - [(int_x86_wrssd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PS; + [(int_x86_wrssd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4, PS; def WRSSQ_EVEX : RI<0x66, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "wrssq\t{$src, $dst|$dst, $src}", - [(int_x86_wrssq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PS; + [(int_x86_wrssq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4, PS; def WRUSSD_EVEX : I<0x65, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "wrussd\t{$src, $dst|$dst, $src}", - [(int_x86_wrussd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PD; + [(int_x86_wrussd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4, PD; def WRUSSQ_EVEX : RI<0x65, MRMDestMem, (outs), (ins i64mem:$dst, 
GR64:$src), "wrussq\t{$src, $dst|$dst, $src}", - [(int_x86_wrussq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4PD; + [(int_x86_wrussq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4, PD; } let Defs = [SSP] in { let Uses = [SSP] in { def SETSSBSY : I<0x01, MRM_E8, (outs), (ins), "setssbsy", - [(int_x86_setssbsy)]>, XS; + [(int_x86_setssbsy)]>, TB, XS; } // Uses SSP def CLRSSBSY : I<0xAE, MRM6m, (outs), (ins i32mem:$src), "clrssbsy\t$src", - [(int_x86_clrssbsy addr:$src)]>, XS; + [(int_x86_clrssbsy addr:$src)]>, TB, XS; } // Defs SSP } // SchedRW let SchedRW = [WriteSystem] in { - def ENDBR64 : I<0x1E, MRM_FA, (outs), (ins), "endbr64", []>, XS; - def ENDBR32 : I<0x1E, MRM_FB, (outs), (ins), "endbr32", []>, XS; + def ENDBR64 : I<0x1E, MRM_FA, (outs), (ins), "endbr64", []>, TB, XS; + def ENDBR32 : I<0x1E, MRM_FB, (outs), (ins), "endbr32", []>, TB, XS; } // SchedRW //===----------------------------------------------------------------------===// @@ -574,51 +574,51 @@ let SchedRW = [WriteSystem] in { // on Windows without needing to enable the xsave feature to be compatible with // MSVC. let Defs = [EDX, EAX], Uses = [ECX] in -def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, PS; +def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB, PS; let Uses = [EDX, EAX, ECX] in def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", - [(int_x86_xsetbv ECX, EDX, EAX)]>, PS; + [(int_x86_xsetbv ECX, EDX, EAX)]>, TB, PS; let Uses = [EDX, EAX] in { def XSAVE : I<0xAE, MRM4m, (outs), (ins opaquemem:$dst), "xsave\t$dst", - [(int_x86_xsave addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>; + [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVE]>; def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaquemem:$dst), "xsave64\t$dst", - [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>; + [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVE, In64BitMode]>; def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaquemem:$dst), "xrstor\t$dst", - [(int_x86_xrstor addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>; + [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVE]>; def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaquemem:$dst), "xrstor64\t$dst", - [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>; + [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVE, In64BitMode]>; def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaquemem:$dst), "xsaveopt\t$dst", - [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT]>; + [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVEOPT]>; def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaquemem:$dst), "xsaveopt64\t$dst", - [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT, In64BitMode]>; + [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVEOPT, In64BitMode]>; def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaquemem:$dst), "xsavec\t$dst", - [(int_x86_xsavec addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC]>; + [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVEC]>; def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaquemem:$dst), "xsavec64\t$dst", - [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC, In64BitMode]>; + [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVEC, In64BitMode]>; def XSAVES : I<0xC7, MRM5m, (outs), (ins opaquemem:$dst), "xsaves\t$dst", - [(int_x86_xsaves addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>; + [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, PS, 
Requires<[HasXSAVES]>; def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaquemem:$dst), "xsaves64\t$dst", - [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>; + [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVE, In64BitMode]>; def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaquemem:$dst), "xrstors\t$dst", - [(int_x86_xrstors addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>; + [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVES]>; def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaquemem:$dst), "xrstors64\t$dst", - [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES, In64BitMode]>; + [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVES, In64BitMode]>; } // Uses } // SchedRW @@ -651,10 +651,10 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in let SchedRW = [WriteSystem] in { let Defs = [EAX, EDX], Uses = [ECX] in def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", - [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, PS; + [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB, PS; let Uses = [EAX, ECX, EDX] in def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", - [(X86wrpkru EAX, EDX, ECX)]>, PS; + [(X86wrpkru EAX, EDX, ECX)]>, TB, PS; } // SchedRW //===----------------------------------------------------------------------===// @@ -662,28 +662,28 @@ let Uses = [EAX, ECX, EDX] in let Predicates = [HasFSGSBase, In64BitMode], SchedRW = [WriteSystem] in { def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins), "rdfsbase{l}\t$dst", - [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS; + [(set GR32:$dst, (int_x86_rdfsbase_32))]>, TB, XS; def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins), "rdfsbase{q}\t$dst", - [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS; + [(set GR64:$dst, (int_x86_rdfsbase_64))]>, TB, XS; def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins), "rdgsbase{l}\t$dst", - [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS; + [(set GR32:$dst, (int_x86_rdgsbase_32))]>, TB, XS; def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins), "rdgsbase{q}\t$dst", - [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS; + [(set GR64:$dst, (int_x86_rdgsbase_64))]>, TB, XS; def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src), "wrfsbase{l}\t$src", - [(int_x86_wrfsbase_32 GR32:$src)]>, XS; + [(int_x86_wrfsbase_32 GR32:$src)]>, TB, XS; def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src), "wrfsbase{q}\t$src", - [(int_x86_wrfsbase_64 GR64:$src)]>, XS; + [(int_x86_wrfsbase_64 GR64:$src)]>, TB, XS; def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src), "wrgsbase{l}\t$src", - [(int_x86_wrgsbase_32 GR32:$src)]>, XS; + [(int_x86_wrgsbase_32 GR32:$src)]>, TB, XS; def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src), "wrgsbase{q}\t$src", - [(int_x86_wrgsbase_64 GR64:$src)]>, XS; + [(int_x86_wrgsbase_64 GR64:$src)]>, TB, XS; } //===----------------------------------------------------------------------===// @@ -691,15 +691,15 @@ let Predicates = [HasFSGSBase, In64BitMode], SchedRW = [WriteSystem] in { let SchedRW = [WriteSystem] in { def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), "invpcid\t{$src2, $src1|$src1, $src2}", - [(int_x86_invpcid GR32:$src1, addr:$src2)]>, T8PD, + [(int_x86_invpcid GR32:$src1, addr:$src2)]>, T8, PD, Requires<[Not64BitMode, HasINVPCID]>; def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), - "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8, PD, Requires<[In64BitMode, HasINVPCID]>; def 
INVPCID64_EVEX : I<0xF2, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), "invpcid\t{$src2, $src1|$src1, $src2}", []>, - EVEX, NoCD8, T_MAP4XS, Requires<[In64BitMode, HasINVPCID]>; + EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode, HasINVPCID]>; } // SchedRW let Predicates = [In64BitMode, HasINVPCID] in { @@ -718,15 +718,15 @@ let Predicates = [In64BitMode, HasINVPCID] in { //===----------------------------------------------------------------------===// // SMAP Instruction let Defs = [EFLAGS], SchedRW = [WriteSystem] in { - def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, PS; - def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, PS; + def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB, PS; + def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB, PS; } //===----------------------------------------------------------------------===// // SMX Instruction let SchedRW = [WriteSystem] in { let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in { - def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, PS; + def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB, PS; } // Uses, Defs } // SchedRW @@ -747,9 +747,9 @@ def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>; // RDPID Instruction let SchedRW = [WriteSystem] in { def RDPID32 : I<0xC7, MRM7r, (outs GR32:$dst), (ins), - "rdpid\t$dst", [(set GR32:$dst, (int_x86_rdpid))]>, XS, + "rdpid\t$dst", [(set GR32:$dst, (int_x86_rdpid))]>, TB, XS, Requires<[Not64BitMode, HasRDPID]>; -def RDPID64 : I<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdpid\t$dst", []>, XS, +def RDPID64 : I<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdpid\t$dst", []>, TB, XS, Requires<[In64BitMode, HasRDPID]>; } // SchedRW @@ -765,17 +765,17 @@ let Predicates = [In64BitMode, HasRDPID] in { // PTWRITE Instruction - Write Data to a Processor Trace Packet let SchedRW = [WriteSystem] in { def PTWRITEm: I<0xAE, MRM4m, (outs), (ins i32mem:$dst), - "ptwrite{l}\t$dst", [(int_x86_ptwrite32 (loadi32 addr:$dst))]>, XS, + "ptwrite{l}\t$dst", [(int_x86_ptwrite32 (loadi32 addr:$dst))]>, TB, XS, Requires<[HasPTWRITE]>; def PTWRITE64m : RI<0xAE, MRM4m, (outs), (ins i64mem:$dst), - "ptwrite{q}\t$dst", [(int_x86_ptwrite64 (loadi64 addr:$dst))]>, XS, + "ptwrite{q}\t$dst", [(int_x86_ptwrite64 (loadi64 addr:$dst))]>, TB, XS, Requires<[In64BitMode, HasPTWRITE]>; def PTWRITEr : I<0xAE, MRM4r, (outs), (ins GR32:$dst), - "ptwrite{l}\t$dst", [(int_x86_ptwrite32 GR32:$dst)]>, XS, + "ptwrite{l}\t$dst", [(int_x86_ptwrite32 GR32:$dst)]>, TB, XS, Requires<[HasPTWRITE]>; def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst), - "ptwrite{q}\t$dst", [(int_x86_ptwrite64 GR64:$dst)]>, XS, + "ptwrite{q}\t$dst", [(int_x86_ptwrite64 GR64:$dst)]>, TB, XS, Requires<[In64BitMode, HasPTWRITE]>; } // SchedRW @@ -784,7 +784,7 @@ def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst), let SchedRW = [WriteSystem] in { let Uses = [ECX], Defs = [EAX, EDX] in - def RDPRU : I<0x01, MRM_FD, (outs), (ins), "rdpru", []>, PS, + def RDPRU : I<0x01, MRM_FD, (outs), (ins), "rdpru", []>, TB, PS, Requires<[HasRDPRU]>; } @@ -803,6 +803,6 @@ let Uses = [ECX], Defs = [EAX, EDX] in let SchedRW = [WriteSystem] in { let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX, RDX, EFLAGS] in - def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, PS, + def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, TB, PS, Requires<[HasPCONFIG]>; } // SchedRW diff --git a/llvm/lib/Target/X86/X86InstrTDX.td b/llvm/lib/Target/X86/X86InstrTDX.td index 8d7cd60820953..fe01677b2ea17 100644 --- 
a/llvm/lib/Target/X86/X86InstrTDX.td +++ b/llvm/lib/Target/X86/X86InstrTDX.td @@ -17,23 +17,17 @@ // 64-bit only instructions let SchedRW = [WriteSystem], Predicates = [In64BitMode] in { // SEAMCALL - Call to SEAM VMX-root Operation Module -def SEAMCALL : I<0x01, MRM_CF, (outs), (ins), - "seamcall", []>, PD; +def SEAMCALL : I<0x01, MRM_CF, (outs), (ins), "seamcall", []>, TB, PD; // SEAMRET - Return to Legacy VMX-root Operation -def SEAMRET : I<0x01, MRM_CD, (outs), (ins), - "seamret", []>, PD; +def SEAMRET : I<0x01, MRM_CD, (outs), (ins), "seamret", []>, TB, PD; // SEAMOPS - SEAM Operations -def SEAMOPS : I<0x01, MRM_CE, (outs), (ins), - "seamops", []>, PD; - +def SEAMOPS : I<0x01, MRM_CE, (outs), (ins), "seamops", []>, TB, PD; } // SchedRW // common instructions let SchedRW = [WriteSystem] in { // TDCALL - Call SEAM Module Functions -def TDCALL : I<0x01, MRM_CC, (outs), (ins), - "tdcall", []>, PD; - +def TDCALL : I<0x01, MRM_CC, (outs), (ins), "tdcall", []>, TB, PD; } // SchedRW diff --git a/llvm/lib/Target/X86/X86InstrTSX.td b/llvm/lib/Target/X86/X86InstrTSX.td index 7671eb4676eed..cc9174a0c491c 100644 --- a/llvm/lib/Target/X86/X86InstrTSX.td +++ b/llvm/lib/Target/X86/X86InstrTSX.td @@ -37,11 +37,11 @@ def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>; } def XEND : I<0x01, MRM_D5, (outs), (ins), - "xend", [(int_x86_xend)]>, PS, Requires<[HasRTM]>; + "xend", [(int_x86_xend)]>, TB, PS, Requires<[HasRTM]>; let Defs = [EFLAGS] in def XTEST : I<0x01, MRM_D6, (outs), (ins), - "xtest", [(set EFLAGS, (X86xtest))]>, PS, Requires<[HasRTM]>; + "xtest", [(set EFLAGS, (X86xtest))]>, TB, PS, Requires<[HasRTM]>; def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), "xabort\t$imm", diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index dd59a641dfaa2..87eacf704de6c 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -27,41 +27,18 @@ class REP { bit hasREPPrefix = 1; } class TB { Map OpMap = TB; } class T8 { Map OpMap = T8; } class TA { Map OpMap = TA; } +class T_MAP4 { Map OpMap = T_MAP4; } +class T_MAP5 { Map OpMap = T_MAP5; } +class T_MAP6 { Map OpMap = T_MAP6; } +class T_MAP7 { Map OpMap = T_MAP7; } class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; } class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; } class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; } class ThreeDNow { Map OpMap = ThreeDNow; } -class T_MAP4 { Map OpMap = T_MAP4; } -class T_MAP4PS : T_MAP4 { Prefix OpPrefix = PS; } // none -class T_MAP4PD : T_MAP4 { Prefix OpPrefix = PD; } // 0x66 -class T_MAP4XS : T_MAP4 { Prefix OpPrefix = XS; } // 0xF3 -class T_MAP4XD : T_MAP4 { Prefix OpPrefix = XD; } // 0xF2 -class T_MAP5 { Map OpMap = T_MAP5; } -class T_MAP5PS : T_MAP5 { Prefix OpPrefix = PS; } // none -class T_MAP5PD : T_MAP5 { Prefix OpPrefix = PD; } // 0x66 -class T_MAP5XS : T_MAP5 { Prefix OpPrefix = XS; } // 0xF3 -class T_MAP5XD : T_MAP5 { Prefix OpPrefix = XD; } // 0xF2 -class T_MAP6 { Map OpMap = T_MAP6; } -class T_MAP6PS : T_MAP6 { Prefix OpPrefix = PS; } -class T_MAP6PD : T_MAP6 { Prefix OpPrefix = PD; } -class T_MAP6XS : T_MAP6 { Prefix OpPrefix = XS; } -class T_MAP6XD : T_MAP6 { Prefix OpPrefix = XD; } -class T_MAP7 { Map OpMap = T_MAP7; } -class T_MAP7XS : T_MAP7 { Prefix OpPrefix = XS; } // 0xF3 -class T_MAP7XD : T_MAP7 { Prefix OpPrefix = XD; } // 0xF2 -class OBXS { Prefix OpPrefix = XS; } -class PS : TB { Prefix OpPrefix = PS; } -class PD : TB { Prefix OpPrefix = PD; } -class XD : TB { Prefix OpPrefix = XD; } -class XS : 
TB { Prefix OpPrefix = XS; } -class T8PS : T8 { Prefix OpPrefix = PS; } -class T8PD : T8 { Prefix OpPrefix = PD; } -class T8XD : T8 { Prefix OpPrefix = XD; } -class T8XS : T8 { Prefix OpPrefix = XS; } -class TAPS : TA { Prefix OpPrefix = PS; } -class TAPD : TA { Prefix OpPrefix = PD; } -class TAXD : TA { Prefix OpPrefix = XD; } -class TAXS : TA { Prefix OpPrefix = XS; } +class PS { Prefix OpPrefix = PS; } +class PD { Prefix OpPrefix = PD; } +class XD { Prefix OpPrefix = XD; } +class XS { Prefix OpPrefix = XS; } class VEX { Encoding OpEnc = EncVEX; } class WIG { bit IgnoresW = 1; } // Special version of REX_W that can be changed to VEX.W==0 for EVEX2VEX. @@ -90,23 +67,23 @@ class XOP { Encoding OpEnc = EncXOP; } class EVEX2VEXOverride<string VEXInstrName> { string EVEX2VEXOverride = VEXInstrName; } -class AVX512BIi8Base : PD { +class AVX512BIi8Base : TB, PD { Domain ExeDomain = SSEPackedInt; ImmType ImmT = Imm8; } -class AVX512XSIi8Base : XS { +class AVX512XSIi8Base : TB, XS { Domain ExeDomain = SSEPackedInt; ImmType ImmT = Imm8; } -class AVX512XDIi8Base : XD { +class AVX512XDIi8Base : TB, XD { Domain ExeDomain = SSEPackedInt; ImmType ImmT = Imm8; } -class AVX512PSIi8Base : PS { +class AVX512PSIi8Base : TB, PS { Domain ExeDomain = SSEPackedSingle; ImmType ImmT = Imm8; } -class AVX512PDIi8Base : PD { +class AVX512PDIi8Base : TB, PD { Domain ExeDomain = SSEPackedDouble; ImmType ImmT = Imm8; } @@ -591,26 +568,26 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>; + : I<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[UseSSE1]>; class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE1]>; + : Ii8<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[UseSSE1]>; class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS, + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, PS, Requires<[UseSSE1]>; class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, PS, Requires<[UseSSE1]>; class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS, + : I<o, F, outs, ins, !strconcat("v", asm), pattern>, TB, XS, Requires<[HasAVX]>; class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, PS, - Requires<[HasAVX]>; + : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, + TB, PS, Requires<[HasAVX]>; // SSE2 Instruction Templates: // @@ -632,49 +609,49 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>; + : I<o, F, outs, ins, asm, pattern>, TB, XD, Requires<[UseSSE2]>; class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[UseSSE2]>; + : Ii8<o, F, outs, ins, asm, pattern>, TB, XD, Requires<[UseSSE2]>; class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, 
XS, Requires<[UseSSE2]>; + : I<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[UseSSE2]>; class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>; + : Ii8<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[UseSSE2]>; class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD, + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, PD, Requires<[UseSSE2]>; class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, PD, Requires<[UseSSE2]>; class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XD, + : I<o, F, outs, ins, !strconcat("v", asm), pattern>, TB, XD, Requires<[UseAVX]>; class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern>, XS, + : I<o, F, outs, ins, !strconcat("v", asm), pattern>, TB, XS, Requires<[HasAVX]>; class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedDouble>, - PD, Requires<[HasAVX]>; + TB, PD, Requires<[HasAVX]>; class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, !strconcat("v", asm), pattern>, PD, + : I<o, F, outs, ins, !strconcat("v", asm), pattern>, TB, PD, Requires<[UseAVX]>; class S2I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, PD, Requires<[UseSSE2]>; + : I<o, F, outs, ins, asm, pattern>, TB, PD, Requires<[UseSSE2]>; class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX, HasSSE2]>; + : Ii8<o, F, outs, ins, asm, pattern>, TB, XD, Requires<[HasMMX, HasSSE2]>; class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX, HasSSE2]>; + : Ii8<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[HasMMX, HasSSE2]>; // SSE3 Instruction Templates: // @@ -684,15 +661,15 @@ class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, XS, + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, XS, Requires<[UseSSE3]>; class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, XD, + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, XD, Requires<[UseSSE3]>; class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD, + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, PD, Requires<[UseSSE3]>; @@ -709,19 +686,19 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD, Requires<[UseSSSE3]>; class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, 
pattern, SSEPackedInt>, TAPD, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD, Requires<[UseSSSE3]>; class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PS, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PS, Requires<[HasMMX, HasSSSE3]>; class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPS, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PS, Requires<[HasMMX, HasSSSE3]>; // SSE4.1 Instruction Templates: @@ -731,11 +708,11 @@ class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, // class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD, Requires<[UseSSE41]>; class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD, Requires<[UseSSE41]>; // SSE4.2 Instruction Templates: @@ -743,13 +720,13 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, // SS428I - SSE 4.2 instructions with T8 prefix. class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD, Requires<[UseSSE42]>; // SS42AI = SSE 4.2 instructions with TA prefix class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD, Requires<[UseSSE42]>; // CRC32I - SSE 4.2 CRC32 instructions. @@ -757,42 +734,42 @@ class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm, // controlled by the SSE42 flag. class CRC32I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, T8XD, Requires<[HasCRC32]>; + : I<o, F, outs, ins, asm, pattern>, T8, XD, Requires<[HasCRC32]>; // AVX Instruction Templates: // Instructions introduced in AVX (no SSE equivalent forms) // -// AVX8I - AVX instructions with T8PD prefix. -// AVXAIi8 - AVX instructions with TAPD prefix and ImmT = Imm8. +// AVX8I - AVX instructions with T8, PD prefix. +// AVXAIi8 - AVX instructions with TA, PD prefix and ImmT = Imm8. class AVX8I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD, Requires<[HasAVX]>; class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD, Requires<[HasAVX]>; // AVX2 Instruction Templates: // Instructions introduced in AVX2 (no SSE equivalent forms) // -// AVX28I - AVX2 instructions with T8PD prefix. -// AVX2AIi8 - AVX2 instructions with TAPD prefix and ImmT = Imm8. +// AVX28I - AVX2 instructions with T8, PD prefix. +// AVX2AIi8 - AVX2 instructions with TA, PD prefix and ImmT = Imm8. 
class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD, Requires<[HasAVX2]>; class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD, Requires<[HasAVX2]>; // AVX-512 Instruction Templates: // Instructions introduced in AVX-512 (no SSE equivalent forms) // -// AVX5128I - AVX-512 instructions with T8PD prefix. -// AVX512AIi8 - AVX-512 instructions with TAPD prefix and ImmT = Imm8. +// AVX5128I - AVX-512 instructions with T8, PD prefix. +// AVX512AIi8 - AVX-512 instructions with TA, PD prefix and ImmT = Imm8. // AVX512PDI - AVX-512 instructions with PD, double packed. // AVX512PSI - AVX-512 instructions with PS, single packed. // AVX512XS8I - AVX-512 instructions with T8 and XS prefixes. @@ -802,39 +779,39 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD, Requires<[HasAVX512]>; -class AVX5128IBase : T8PD { +class AVX5128IBase : T8, PD { Domain ExeDomain = SSEPackedInt; } class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8XS, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, XS, Requires<[HasAVX512]>; class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, XS, + : I<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[HasAVX512]>; class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, XD, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, TB, XD, Requires<[HasAVX512]>; class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, TB, PD, Requires<[HasAVX512]>; -class AVX512BIBase : PD { +class AVX512BIBase : TB, PD { Domain ExeDomain = SSEPackedInt; } class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, PD, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TB, PD, Requires<[HasAVX512]>; class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD, Requires<[HasAVX512]>; -class AVX512AIi8Base : TAPD { +class AVX512AIi8Base : TA, PD { ImmType ImmT = Imm8; } class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, @@ -843,11 +820,11 @@ class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[HasAVX512]>; class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, PD, + : I<o, F, outs, ins, asm, pattern, SSEPackedDouble>, TB, PD, Requires<[HasAVX512]>; class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, PS, + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, PS, 
Requires<[HasAVX512]>; class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, Domain d> @@ -857,7 +834,7 @@ class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm, : I<o, F, outs, ins, asm, pattern, d>, Requires<[HasAVX512]>; class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> - : I<o, F, outs, ins, asm, pattern>, T8PD, + : I<o, F, outs, ins, asm, pattern>, T8, PD, EVEX, VVVV, Requires<[HasAVX512]>; class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm, @@ -870,45 +847,45 @@ class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm, // These use the same encoding as the SSE4.2 T8 and TA encodings. class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8PD, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PD, Requires<[NoAVX, HasAES]>; class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD, Requires<[NoAVX, HasAES]>; // PCLMUL Instruction Templates class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD; + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD; // FMA3 Instruction Templates class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> - : I<o, F, outs, ins, asm, pattern>, T8PD, + : I<o, F, outs, ins, asm, pattern>, T8, PD, VEX, VVVV, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>; class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> - : I<o, F, outs, ins, asm, pattern>, T8PD, + : I<o, F, outs, ins, asm, pattern>, T8, PD, VEX, VVVV, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>; class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> - : I<o, F, outs, ins, asm, pattern>, T8PD, + : I<o, F, outs, ins, asm, pattern>, T8, PD, VEX, VVVV, FMASC, Requires<[HasFMA, NoAVX512]>; // FMA4 Instruction Templates class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> - : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD, + : Ii8Reg<o, F, outs, ins, asm, pattern>, TA, PD, VEX, VVVV, FMASC, Requires<[HasFMA4, NoVLX]>; class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> - : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD, + : Ii8Reg<o, F, outs, ins, asm, pattern>, TA, PD, VEX, VVVV, FMASC, Requires<[HasFMA4, NoAVX512]>; class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> - : Ii8Reg<o, F, outs, ins, asm, pattern>, TAPD, + : Ii8Reg<o, F, outs, ins, asm, pattern>, TA, PD, VEX, VVVV, FMASC, Requires<[HasFMA4]>; // XOP 2, 3 and 4 Operand Instruction Template @@ -931,7 +908,7 @@ class IXOPi8Reg<bits<8> o, Format F, dag outs, dag ins, string asm, // XOP 5 operand instruction (VEX encoding!) class IXOP5<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern> - : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedInt>, TAPD, + : Ii8Reg<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PD, VEX, VVVV, Requires<[HasXOP]>; // X86-64 Instruction templates... @@ -970,14 +947,14 @@ class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm, // MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix. 
class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>; + : I<o, F, outs, ins, asm, pattern>, TB, PS, Requires<[HasMMX]>; class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, PS, REX_W, + : I<o, F, outs, ins, asm, pattern>, TB, PS, REX_W, Requires<[HasMMX,In64BitMode]>; class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, PS, Requires<[HasMMX]>; + : Ii8<o, F, outs, ins, asm, pattern>, TB, PS, Requires<[HasMMX]>; /// ITy - This instruction base class takes the type info for the instruction. /// Using this, it: diff --git a/llvm/lib/Target/X86/X86InstrVMX.td b/llvm/lib/Target/X86/X86InstrVMX.td index c3fba9c5728ca..f2fc0dbaa3703 100644 --- a/llvm/lib/Target/X86/X86InstrVMX.td +++ b/llvm/lib/Target/X86/X86InstrVMX.td @@ -17,33 +17,33 @@ let SchedRW = [WriteSystem] in { // 66 0F 38 80 def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD, + "invept\t{$src2, $src1|$src1, $src2}", []>, T8, PD, Requires<[Not64BitMode]>; def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), - "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD, + "invept\t{$src2, $src1|$src1, $src2}", []>, T8, PD, Requires<[In64BitMode]>; def INVEPT64_EVEX : I<0xF0, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), "invept\t{$src2, $src1|$src1, $src2}", []>, - EVEX, NoCD8, T_MAP4XS, Requires<[In64BitMode]>; + EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode]>; // 66 0F 38 81 def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8, PD, Requires<[Not64BitMode]>; def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), - "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8, PD, Requires<[In64BitMode]>; def INVVPID64_EVEX : I<0xF1, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), "invvpid\t{$src2, $src1|$src1, $src2}", []>, - EVEX, NoCD8, T_MAP4XS, Requires<[In64BitMode]>; + EVEX, NoCD8, T_MAP4, XS, Requires<[In64BitMode]>; // 0F 01 C1 def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB; def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), - "vmclear\t$vmcs", []>, PD; + "vmclear\t$vmcs", []>, TB, PD; // OF 01 D4 -def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, PS; +def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB, PS; // 0F 01 C2 def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; @@ -51,35 +51,35 @@ def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; // 0F 01 C3 def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB; def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), - "vmptrld\t$vmcs", []>, PS; + "vmptrld\t$vmcs", []>, TB, PS; def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs), - "vmptrst\t$vmcs", []>, PS; + "vmptrst\t$vmcs", []>, TB, PS; def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), - "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; + "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[In64BitMode]>; def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), - "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; + "vmread{l}\t{$src, $dst|$dst, $src}", 
[]>, TB, PS, Requires<[Not64BitMode]>; let mayStore = 1 in { def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), - "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; + "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[In64BitMode]>; def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), - "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; + "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[Not64BitMode]>; } // mayStore def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[In64BitMode]>; def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[Not64BitMode]>; let mayLoad = 1 in { def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[In64BitMode]>; def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[Not64BitMode]>; } // mayLoad // 0F 01 C4 def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB; def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon), - "vmxon\t$vmxon", []>, XS; + "vmxon\t$vmxon", []>, TB, XS; } // SchedRW From 06a9c6738af79c6c9ce7671b78a99f1c23ff51b0 Mon Sep 17 00:00:00 2001 From: Shan Huang <52285902006@stu.ecnu.edu.cn> Date: Fri, 22 Dec 2023 16:26:32 +0800 Subject: [PATCH 155/342] [CVP] Fix #76058: missing debug location in processSDiv function (#76118) This PR fixes #76058. --- .../Scalar/CorrelatedValuePropagation.cpp | 6 ++- .../sdiv_missing_debugloc.ll | 42 +++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/CorrelatedValuePropagation/sdiv_missing_debugloc.ll diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index d2dfc764d042b..c44d3748a80d8 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -935,11 +935,13 @@ static bool processSDiv(BinaryOperator *SDI, const ConstantRange &LCR, UDiv->setDebugLoc(SDI->getDebugLoc()); UDiv->setIsExact(SDI->isExact()); - Value *Res = UDiv; + auto *Res = UDiv; // If the operands had two different domains, we need to negate the result. 
- if (Ops[0].D != Ops[1].D) + if (Ops[0].D != Ops[1].D) { Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI); + Res->setDebugLoc(SDI->getDebugLoc()); + } SDI->replaceAllUsesWith(Res); SDI->eraseFromParent(); diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/sdiv_missing_debugloc.ll b/llvm/test/Transforms/CorrelatedValuePropagation/sdiv_missing_debugloc.ll new file mode 100644 index 0000000000000..3bb0ca384240f --- /dev/null +++ b/llvm/test/Transforms/CorrelatedValuePropagation/sdiv_missing_debugloc.ll @@ -0,0 +1,42 @@ +; RUN: opt -passes=correlated-propagation -S < %s | FileCheck %s +; CHECK: %{{[a-zA-Z0-9_]*}} = udiv i8 %x.nonneg, %y, !dbg ![[DBGLOC:[0-9]+]] +; CHECK-NEXT: %{{[a-zA-Z0-9_]*}}.neg = sub i8 0, %rem1, !dbg ![[DBGLOC]] + +; Function Attrs: inaccessiblememonly nocallback nofree nosync nounwind willreturn +declare void @llvm.assume(i1 noundef) #0 + +define void @test8_neg_neg(i8 %x, i8 %y) !dbg !5 { + %c0 = icmp sle i8 %x, 0, !dbg !13 + call void @llvm.dbg.value(metadata i1 %c0, metadata !9, metadata !DIExpression()), !dbg !13 + call void @llvm.assume(i1 %c0), !dbg !14 + %c1 = icmp sge i8 %y, 0, !dbg !15 + call void @llvm.dbg.value(metadata i1 %c1, metadata !11, metadata !DIExpression()), !dbg !15 + call void @llvm.assume(i1 %c1), !dbg !16 + %rem = sdiv i8 %x, %y, !dbg !17 + call void @llvm.dbg.value(metadata i8 %rem, metadata !12, metadata !DIExpression()), !dbg !17 + ret void, !dbg !18 +} + +; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!1} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!2 = !DIFile(filename: "reduced.ll", directory: "/") +!5 = distinct !DISubprogram(name: "test8_neg_neg", linkageName: "test8_neg_neg", scope: null, file: !2, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !1, retainedNodes: !8) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !{!9, !11, !12} +!9 = !DILocalVariable(name: "1", scope: !5, file: !2, line: 1, type: !10) +!10 = !DIBasicType(name: "ty8", size: 8, encoding: DW_ATE_unsigned) +!11 = !DILocalVariable(name: "2", scope: !5, file: !2, line: 3, type: !10) +!12 = !DILocalVariable(name: "3", scope: !5, file: !2, line: 5, type: !10) +!13 = !DILocation(line: 1, column: 1, scope: !5) +!14 = !DILocation(line: 2, column: 1, scope: !5) +!15 = !DILocation(line: 3, column: 1, scope: !5) +!16 = !DILocation(line: 4, column: 1, scope: !5) +!17 = !DILocation(line: 5, column: 1, scope: !5) +!18 = !DILocation(line: 6, column: 1, scope: !5) \ No newline at end of file From 17858ce6f3d24f994f6ad8c899bfa4eed39f739d Mon Sep 17 00:00:00 2001 From: Wang Pengcheng <wangpengcheng.pp@bytedance.com> Date: Fri, 22 Dec 2023 16:31:38 +0800 Subject: [PATCH 156/342] [MacroFusion] Remove createBranchMacroFusionDAGMutation (#76209) Instead, we add a `BranchOnly` parameter to indicate that only branches with its predecessors will be fused. X86 is the only user of `createBranchMacroFusionDAGMutation`. 
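As a rough illustration of the updated API (not part of this patch), a target that previously called createBranchMacroFusionDAGMutation now passes BranchOnly=true to the single remaining entry point, exactly as the X86 change below does. The predicate and function names here are hypothetical; only the MacroFusionPredTy signature and createMacroFusionDAGMutation come from MacroFusion.h:

#include "llvm/CodeGen/MacroFusion.h"
#include <memory>

using namespace llvm;

// Hypothetical predicate with the MacroFusionPredTy signature; a real target
// would inspect the opcodes of FirstMI and SecondMI to decide whether the
// pair benefits from being scheduled back to back.
static bool shouldFuseExample(const TargetInstrInfo &TII,
                              const TargetSubtargetInfo &STI,
                              const MachineInstr *FirstMI,
                              const MachineInstr &SecondMI) {
  return false;
}

std::unique_ptr<ScheduleDAGMutation> createExampleFusionDAGMutation() {
  // Replaces the former createBranchMacroFusionDAGMutation(shouldFuseExample).
  return createMacroFusionDAGMutation(shouldFuseExample, /*BranchOnly=*/true);
}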
--- llvm/include/llvm/CodeGen/MacroFusion.h | 12 ++++-------- llvm/lib/CodeGen/MacroFusion.cpp | 12 +++--------- llvm/lib/Target/X86/X86MacroFusion.cpp | 3 ++- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MacroFusion.h b/llvm/include/llvm/CodeGen/MacroFusion.h index a359fca604260..191c906e9ef6c 100644 --- a/llvm/include/llvm/CodeGen/MacroFusion.h +++ b/llvm/include/llvm/CodeGen/MacroFusion.h @@ -50,15 +50,11 @@ bool fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU, /// for instructions that benefit according to the target-specific /// predicate functions. shouldScheduleAdjacent will be true if any of the /// provided predicates are true. +/// If BranchOnly is true, only branch instructions with one of their +/// predecessors will be fused. std::unique_ptr<ScheduleDAGMutation> -createMacroFusionDAGMutation(ArrayRef<MacroFusionPredTy> Predicates); - -/// Create a DAG scheduling mutation to pair branch instructions with one -/// of their predecessors back to back for instructions that benefit according -/// to the target-specific predicate functions. shouldScheduleAdjacent will be -/// true if any of the provided predicates are true. -std::unique_ptr<ScheduleDAGMutation> -createBranchMacroFusionDAGMutation(ArrayRef<MacroFusionPredTy> Predicates); +createMacroFusionDAGMutation(ArrayRef<MacroFusionPredTy> Predicates, + bool BranchOnly = false); } // end namespace llvm diff --git a/llvm/lib/CodeGen/MacroFusion.cpp b/llvm/lib/CodeGen/MacroFusion.cpp index aff4d95781f45..5bd6ca0978a4b 100644 --- a/llvm/lib/CodeGen/MacroFusion.cpp +++ b/llvm/lib/CodeGen/MacroFusion.cpp @@ -212,15 +212,9 @@ bool MacroFusion::scheduleAdjacentImpl(ScheduleDAGInstrs &DAG, SUnit &AnchorSU) } std::unique_ptr<ScheduleDAGMutation> -llvm::createMacroFusionDAGMutation(ArrayRef<MacroFusionPredTy> Predicates) { +llvm::createMacroFusionDAGMutation(ArrayRef<MacroFusionPredTy> Predicates, + bool BranchOnly) { if (EnableMacroFusion) - return std::make_unique<MacroFusion>(Predicates, true); - return nullptr; -} - -std::unique_ptr<ScheduleDAGMutation> llvm::createBranchMacroFusionDAGMutation( - ArrayRef<MacroFusionPredTy> Predicates) { - if (EnableMacroFusion) - return std::make_unique<MacroFusion>(Predicates, false); + return std::make_unique<MacroFusion>(Predicates, !BranchOnly); return nullptr; } diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp index 82667b8cdbdb8..c0fa9aa703243 100644 --- a/llvm/lib/Target/X86/X86MacroFusion.cpp +++ b/llvm/lib/Target/X86/X86MacroFusion.cpp @@ -68,7 +68,8 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, namespace llvm { std::unique_ptr<ScheduleDAGMutation> createX86MacroFusionDAGMutation() { - return createBranchMacroFusionDAGMutation(shouldScheduleAdjacent); + return createMacroFusionDAGMutation(shouldScheduleAdjacent, + /*BranchOnly=*/true); } } // end namespace llvm From 0d903b689ab984d6e7d8e1919a5b37658ae94518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= <balazs.keri@ericsson.com> Date: Fri, 22 Dec 2023 10:07:38 +0100 Subject: [PATCH 157/342] [clang][ASTImporter] Import AlignValueAttr correctly. (#75308) Expression of attribute `align_value` was not imported. Import of the attribute is corrected, a test for it is added, other related tests with FIXME are updated. Fixes #75054. 
--- clang/lib/AST/ASTImporter.cpp | 6 +++ clang/unittests/AST/ASTImporterTest.cpp | 71 +++++++++---------------- 2 files changed, 31 insertions(+), 46 deletions(-) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 1cc47de675bf3..88b8c6abb6d5f 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -9103,6 +9103,12 @@ Expected<Attr *> ASTImporter::Import(const Attr *FromAttr) { break; } + case attr::AlignValue: { + auto *From = cast<AlignValueAttr>(FromAttr); + AI.importAttr(From, AI.importArg(From->getAlignment()).value()); + break; + } + case attr::Format: { const auto *From = cast<FormatAttr>(FromAttr); AI.importAttr(From, Import(From->getType()), From->getFormatIdx(), diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 4c06152d3eb56..9fa7660cde659 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -7445,67 +7445,46 @@ void ImportAttributes::checkImported<Decl>(const Decl *From, const Decl *To) { ToAST->getASTContext().getTranslationUnitDecl()); } -// FIXME: Use ImportAttributes for this test. -TEST_P(ASTImporterOptionSpecificTestBase, ImportExprOfAlignmentAttr) { - // Test if import of these packed and aligned attributes does not trigger an - // error situation where source location from 'From' context is referenced in - // 'To' context through evaluation of the alignof attribute. - // This happens if the 'alignof(A)' expression is not imported correctly. - Decl *FromTU = getTuDecl( +TEST_P(ImportAttributes, ImportAligned) { + AlignedAttr *FromAttr, *ToAttr; + importAttr<RecordDecl>( R"( struct __attribute__((packed)) A { int __attribute__((aligned(8))) X; }; - struct alignas(alignof(A)) S {}; + struct alignas(alignof(A)) test {}; )", - Lang_CXX11, "input.cc"); - auto *FromD = FirstDeclMatcher<CXXRecordDecl>().match( - FromTU, cxxRecordDecl(hasName("S"), unless(isImplicit()))); - ASSERT_TRUE(FromD); - - auto *ToD = Import(FromD, Lang_CXX11); - ASSERT_TRUE(ToD); - - auto *FromAttr = FromD->getAttr<AlignedAttr>(); - auto *ToAttr = ToD->getAttr<AlignedAttr>(); - EXPECT_EQ(FromAttr->isInherited(), ToAttr->isInherited()); - EXPECT_EQ(FromAttr->isPackExpansion(), ToAttr->isPackExpansion()); - EXPECT_EQ(FromAttr->isImplicit(), ToAttr->isImplicit()); - EXPECT_EQ(FromAttr->getSyntax(), ToAttr->getSyntax()); - EXPECT_EQ(FromAttr->getSemanticSpelling(), ToAttr->getSemanticSpelling()); - EXPECT_TRUE(ToAttr->getAlignmentExpr()); + FromAttr, ToAttr); + checkImported(FromAttr->getAlignmentExpr(), ToAttr->getAlignmentExpr()); auto *ToA = FirstDeclMatcher<CXXRecordDecl>().match( - ToD->getTranslationUnitDecl(), + ToAST->getASTContext().getTranslationUnitDecl(), cxxRecordDecl(hasName("A"), unless(isImplicit()))); // Ensure that 'struct A' was imported (through reference from attribute of - // 'S'). + // struct 'test'). EXPECT_TRUE(ToA); } -// FIXME: Use ImportAttributes for this test. -TEST_P(ASTImporterOptionSpecificTestBase, ImportFormatAttr) { - Decl *FromTU = getTuDecl( +TEST_P(ImportAttributes, ImportAlignValue) { + AlignValueAttr *FromAttr, *ToAttr; + importAttr<VarDecl>( + R"( + void *test __attribute__((align_value(64))); + )", + FromAttr, ToAttr); + checkImported(FromAttr->getAlignment(), ToAttr->getAlignment()); +} + +TEST_P(ImportAttributes, ImportFormat) { + FormatAttr *FromAttr, *ToAttr; + importAttr<FunctionDecl>( R"( - int foo(const char * fmt, ...) + int test(const char * fmt, ...) 
__attribute__ ((__format__ (__scanf__, 1, 2))); )", - Lang_CXX03, "input.cc"); - auto *FromD = FirstDeclMatcher<FunctionDecl>().match( - FromTU, functionDecl(hasName("foo"))); - ASSERT_TRUE(FromD); + FromAttr, ToAttr); - auto *ToD = Import(FromD, Lang_CXX03); - ASSERT_TRUE(ToD); - ToD->dump(); // Should not crash! - - auto *FromAttr = FromD->getAttr<FormatAttr>(); - auto *ToAttr = ToD->getAttr<FormatAttr>(); - EXPECT_EQ(FromAttr->isInherited(), ToAttr->isInherited()); - EXPECT_EQ(FromAttr->isPackExpansion(), ToAttr->isPackExpansion()); - EXPECT_EQ(FromAttr->isImplicit(), ToAttr->isImplicit()); - EXPECT_EQ(FromAttr->getSyntax(), ToAttr->getSyntax()); - EXPECT_EQ(FromAttr->getAttributeSpellingListIndex(), - ToAttr->getAttributeSpellingListIndex()); EXPECT_EQ(FromAttr->getType()->getName(), ToAttr->getType()->getName()); + EXPECT_EQ(FromAttr->getFirstArg(), ToAttr->getFirstArg()); + EXPECT_EQ(FromAttr->getFormatIdx(), ToAttr->getFormatIdx()); } TEST_P(ImportAttributes, ImportEnableIf) { From 3f199cb14c7c0fbfe0d33bc5a04f356f35259bf4 Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Fri, 22 Dec 2023 10:11:51 +0100 Subject: [PATCH 158/342] [SROA] Add test for #64081 (NFC) --- llvm/test/Transforms/SROA/pr64081.ll | 33 ++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 llvm/test/Transforms/SROA/pr64081.ll diff --git a/llvm/test/Transforms/SROA/pr64081.ll b/llvm/test/Transforms/SROA/pr64081.ll new file mode 100644 index 0000000000000..e03e5c497b5fb --- /dev/null +++ b/llvm/test/Transforms/SROA/pr64081.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=sroa < %s | FileCheck %s + +%B = type { i1, i3 } + +; FIXME: This is a miscompile. +define void @test(i7 %x) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: i7 [[X:%.*]]) { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[RES:%.*]] = alloca [2 x i8], align 1 +; CHECK-NEXT: [[TMP_SROA_1:%.*]] = alloca i3, align 1 +; CHECK-NEXT: store i7 [[X]], ptr [[TMP_SROA_1]], align 1 +; CHECK-NEXT: store i1 undef, ptr [[RES]], align 1 +; CHECK-NEXT: [[TMP_SROA_1_0_RES_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[RES]], i64 1 +; CHECK-NEXT: [[TMP_SROA_1_0_TMP_SROA_1_0_COPYLOAD:%.*]] = load i3, ptr [[TMP_SROA_1]], align 1 +; CHECK-NEXT: store i3 [[TMP_SROA_1_0_TMP_SROA_1_0_COPYLOAD]], ptr [[TMP_SROA_1_0_RES_SROA_IDX]], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = call i8 @use(ptr [[RES]]) +; CHECK-NEXT: ret void +; +bb: + %res = alloca [2 x i8] + %tmp = alloca { i1, i3 } + %tmp.1 = getelementptr i8, ptr %tmp, i64 1 + store i7 %x, ptr %tmp.1 + call void @llvm.memcpy.p0.p0.i64(ptr %res, ptr %tmp, i64 2, i1 false) + call i8 @use(ptr %res) + ret void +} + +declare void @use(ptr) + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) From 54067c5fbe9fc13ab195cdddb8f17e18d72b5fe4 Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Fri, 22 Dec 2023 10:14:21 +0100 Subject: [PATCH 159/342] [SROA] Use memcpy if type size does not match store size The original memcpy also copies the padding, so make sure that this is still the case after splitting. Fixes https://github.com/llvm/llvm-project/issues/64081. 
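A standalone sketch (assuming the usual LLVM C++ API and headers, not part of the patch) of the distinction the new check relies on: for a type such as i3, the type size in bits is smaller than the store size in bits, so DataLayout::typeSizeEqualsStoreSize() returns false and SROA now falls back to a memcpy, which also copies the padding bits that a scalar load/store pair would drop.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::DataLayout DL("");
  llvm::Type *I3 = llvm::Type::getIntNTy(Ctx, 3);
  // Prints 3 bits of type size versus 8 bits of store size for i3, so the two
  // sizes are not equal and the padding-preserving memcpy path is taken.
  llvm::outs() << "type size in bits: "
               << DL.getTypeSizeInBits(I3).getFixedValue()
               << ", store size in bits: "
               << DL.getTypeStoreSizeInBits(I3).getFixedValue()
               << ", typeSizeEqualsStoreSize: "
               << (DL.typeSizeEqualsStoreSize(I3) ? "true" : "false") << "\n";
  return 0;
}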
--- llvm/lib/Transforms/Scalar/SROA.cpp | 1 + llvm/test/Transforms/SROA/pr64081.ll | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 24da26c9f0f25..656abdb0abbff 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -3285,6 +3285,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset || SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() || + !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) || !NewAI.getAllocatedType()->isSingleValueType()); // If we're just going to emit a memcpy, the alloca hasn't changed, and the diff --git a/llvm/test/Transforms/SROA/pr64081.ll b/llvm/test/Transforms/SROA/pr64081.ll index e03e5c497b5fb..4b89384213826 100644 --- a/llvm/test/Transforms/SROA/pr64081.ll +++ b/llvm/test/Transforms/SROA/pr64081.ll @@ -3,18 +3,17 @@ %B = type { i1, i3 } -; FIXME: This is a miscompile. define void @test(i7 %x) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i7 [[X:%.*]]) { ; CHECK-NEXT: bb: ; CHECK-NEXT: [[RES:%.*]] = alloca [2 x i8], align 1 +; CHECK-NEXT: [[TMP_SROA_0:%.*]] = alloca i1, align 8 ; CHECK-NEXT: [[TMP_SROA_1:%.*]] = alloca i3, align 1 ; CHECK-NEXT: store i7 [[X]], ptr [[TMP_SROA_1]], align 1 -; CHECK-NEXT: store i1 undef, ptr [[RES]], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RES]], ptr align 8 [[TMP_SROA_0]], i64 1, i1 false) ; CHECK-NEXT: [[TMP_SROA_1_0_RES_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[RES]], i64 1 -; CHECK-NEXT: [[TMP_SROA_1_0_TMP_SROA_1_0_COPYLOAD:%.*]] = load i3, ptr [[TMP_SROA_1]], align 1 -; CHECK-NEXT: store i3 [[TMP_SROA_1_0_TMP_SROA_1_0_COPYLOAD]], ptr [[TMP_SROA_1_0_RES_SROA_IDX]], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP_SROA_1_0_RES_SROA_IDX]], ptr align 1 [[TMP_SROA_1]], i64 1, i1 false) ; CHECK-NEXT: [[TMP0:%.*]] = call i8 @use(ptr [[RES]]) ; CHECK-NEXT: ret void ; From 0e46b49de43349f8cbb2a7d4c6badef6d16e31ae Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Fri, 22 Dec 2023 15:44:09 +0700 Subject: [PATCH 160/342] Reapply "RegisterCoalescer: Add implicit-def of super register when coalescing SUBREG_TO_REG" This reverts commit c398fa009a47eb24f88383d5e911e59e70f8db86. PPC backend was fixed in 2f82662ce901c6666fceb9c6c5e0de216a1c9667 --- llvm/lib/CodeGen/RegisterCoalescer.cpp | 51 ++- .../AArch64/GlobalISel/arm64-pcsections.ll | 92 ++--- ...coalescer-breaks-subreg-to-reg-liveness.ll | 185 ++++++++++ ...icit-def-regression-imp-operand-assert.mir | 4 +- ...subreg-to-reg-requires-subrange-update.mir | 47 +++ .../CodeGen/X86/subreg-to-reg-coalescing.mir | 348 ++++++++++++++++++ 6 files changed, 670 insertions(+), 57 deletions(-) create mode 100644 llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll create mode 100644 llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir create mode 100644 llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index c1af37c8510ff..397fff5263426 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -305,7 +305,11 @@ namespace { /// number if it is not zero. 
If DstReg is a physical register and the /// existing subregister number of the def / use being updated is not zero, /// make sure to set it to the correct physical subregister. - void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx); + /// + /// If \p IsSubregToReg, we are coalescing a DstReg = SUBREG_TO_REG + /// SrcReg. This introduces an implicit-def of DstReg on coalesced users. + void updateRegDefsUses(Register SrcReg, Register DstReg, unsigned SubIdx, + bool IsSubregToReg); /// If the given machine operand reads only undefined lanes add an undef /// flag. @@ -1343,8 +1347,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, if (DstReg.isPhysical()) { Register NewDstReg = DstReg; - unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), - DefMI->getOperand(0).getSubReg()); + unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), DefSubIdx); if (NewDstIdx) NewDstReg = TRI->getSubReg(DstReg, NewDstIdx); @@ -1493,7 +1496,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, MRI->setRegClass(DstReg, NewRC); // Update machine operands and add flags. - updateRegDefsUses(DstReg, DstReg, DstIdx); + updateRegDefsUses(DstReg, DstReg, DstIdx, false); NewMI.getOperand(0).setSubReg(NewIdx); // updateRegDefUses can add an "undef" flag to the definition, since // it will replace DstReg with DstReg.DstIdx. If NewIdx is 0, make @@ -1814,7 +1817,7 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx, } void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, - unsigned SubIdx) { + unsigned SubIdx, bool IsSubregToReg) { bool DstIsPhys = DstReg.isPhysical(); LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg); @@ -1854,6 +1857,8 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, if (DstInt && !Reads && SubIdx && !UseMI->isDebugInstr()) Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI)); + bool FullDef = true; + // Replace SrcReg with DstReg in all UseMI operands. for (unsigned i = 0, e = Ops.size(); i != e; ++i) { MachineOperand &MO = UseMI->getOperand(Ops[i]); @@ -1861,9 +1866,13 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, // Adjust <undef> flags in case of sub-register joins. We don't want to // turn a full def into a read-modify-write sub-register def and vice // versa. - if (SubIdx && MO.isDef()) + if (SubIdx && MO.isDef()) { MO.setIsUndef(!Reads); + if (!Reads) + FullDef = false; + } + // A subreg use of a partially undef (super) register may be a complete // undef use now and then has to be marked that way. if (MO.isUse() && !DstIsPhys) { @@ -1895,6 +1904,25 @@ void RegisterCoalescer::updateRegDefsUses(Register SrcReg, Register DstReg, MO.substVirtReg(DstReg, SubIdx, *TRI); } + if (IsSubregToReg && !FullDef) { + // If the coalesed instruction doesn't fully define the register, we need + // to preserve the original super register liveness for SUBREG_TO_REG. + // + // We pretended SUBREG_TO_REG was a regular copy for coalescing purposes, + // but it introduces liveness for other subregisters. Downstream users may + // have been relying on those bits, so we need to ensure their liveness is + // captured with a def of other lanes. + + // FIXME: Need to add new subrange if tracking subranges. We could also + // skip adding this if we knew the other lanes are dead, and only for + // other lanes. 
+ + assert(!MRI->shouldTrackSubRegLiveness(DstReg) && + "this should update subranges"); + MachineInstrBuilder MIB(*MF, UseMI); + MIB.addReg(DstReg, RegState::ImplicitDefine); + } + LLVM_DEBUG({ dbgs() << "\t\tupdated: "; if (!UseMI->isDebugInstr()) @@ -2094,6 +2122,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { }); } + const bool IsSubregToReg = CopyMI->isSubregToReg(); + ShrinkMask = LaneBitmask::getNone(); ShrinkMainRange = false; @@ -2161,9 +2191,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { // Rewrite all SrcReg operands to DstReg. // Also update DstReg operands to include DstIdx if it is set. - if (CP.getDstIdx()) - updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx()); - updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx()); + if (CP.getDstIdx()) { + assert(!IsSubregToReg && "can this happen?"); + updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx(), false); + } + updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx(), + IsSubregToReg); // Shrink subregister ranges if necessary. if (ShrinkMask.any()) { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll index 4c07081404c88..8529dd388ba0f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll @@ -13,7 +13,7 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -47,13 +47,13 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) { ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $w1, $x0, $x2 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x2, 0, implicit-def $x9, pcsections !0 :: (load (s32) from %ir.pnew) + ; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x2, 0, implicit-def renamable $x9, pcsections !0 :: (load (s32) from %ir.pnew) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0, $x9 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -93,7 +93,7 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.3(0x04000000) ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections 
!0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.3, implicit killed $nzcv, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -249,7 +249,7 @@ define i32 @fetch_and_nand(ptr %p) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: renamable $w9 = ANDWri renamable $w8, 2, pcsections !0 ; CHECK-NEXT: $w9 = ORNWrs $wzr, killed renamable $w9, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRW killed renamable $w9, renamable $x0, pcsections !0 :: (volatile store (s32) into %ir.p) @@ -302,7 +302,7 @@ define i32 @fetch_and_or(ptr %p) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s32) from %ir.p) + ; CHECK-NEXT: renamable $w8 = LDAXRW renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s32) from %ir.p) ; CHECK-NEXT: $w10 = ORRWrs renamable $w8, renamable $w9, 0, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STLXRW killed renamable $w10, renamable $x0, pcsections !0 :: (volatile store (s32) into %ir.p) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 @@ -735,8 +735,8 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -761,7 +761,7 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $x0, $x1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: early-clobber renamable $w9 = STXRB renamable $w1, renamable $x0, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w9, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -785,8 +785,8 @@ define i8 @atomicrmw_sub_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def $x9, 
pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -810,8 +810,8 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -835,8 +835,8 @@ define i8 @atomicrmw_or_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -860,8 +860,8 @@ define i8 @atomicrmw_xor_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -885,10 +885,10 @@ define i8 @atomicrmw_min_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from 
%ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -912,10 +912,10 @@ define i8 @atomicrmw_max_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -940,10 +940,10 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 7 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STLXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -968,10 +968,10 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 7 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; 
CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -995,8 +995,8 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1021,7 +1021,7 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $x0, $x1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: early-clobber renamable $w9 = STXRH renamable $w1, renamable $x0, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w9, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1045,8 +1045,8 @@ define i16 @atomicrmw_sub_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1070,8 +1070,8 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 
0, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1095,8 +1095,8 @@ define i16 @atomicrmw_or_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1120,8 +1120,8 @@ define i16 @atomicrmw_xor_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1145,10 +1145,10 @@ define i16 @atomicrmw_min_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 13, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1172,10 +1172,10 @@ define i16 @atomicrmw_max_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRH 
renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def $x9, pcsections !0 + ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def renamable $x9, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1200,10 +1200,10 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 15 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 9, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STLXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1228,10 +1228,10 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def renamable $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w10 = ANDWri renamable $w8, 15 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w10, renamable $w9, 0, implicit-def $nzcv, pcsections !0 - ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0 + ; CHECK-NEXT: renamable $w10 = CSELWr killed renamable $w10, renamable $w9, 8, implicit killed $nzcv, implicit-def renamable $x10, pcsections !0 ; CHECK-NEXT: early-clobber renamable $w11 = STXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} @@ -1257,7 +1257,7 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) { ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) ; CHECK-NEXT: liveins: $w1, $x2, $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w0 = LDXRB renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s8) from %ir.ptr) + ; CHECK-NEXT: renamable $w0 = LDXRB renamable $x8, implicit-def renamable $x0, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = ANDWri renamable 
$w0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.4, implicit killed $nzcv, pcsections !0 @@ -1300,7 +1300,7 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) { ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) ; CHECK-NEXT: liveins: $w1, $x2, $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w0 = LDXRH renamable $x8, implicit-def $x0, pcsections !0 :: (volatile load (s16) from %ir.ptr) + ; CHECK-NEXT: renamable $w0 = LDXRH renamable $x8, implicit-def renamable $x0, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: renamable $w9 = ANDWri renamable $w0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 8, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: Bcc 1, %bb.4, implicit killed $nzcv, pcsections !0 diff --git a/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll new file mode 100644 index 0000000000000..a3c3fc70e9761 --- /dev/null +++ b/llvm/test/CodeGen/X86/coalescer-breaks-subreg-to-reg-liveness.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=x86_64-grtev4-linux-gnu < %s | FileCheck %s + +%struct.wibble = type { %struct.wombat } +%struct.wombat = type { %struct.ham, [3 x i8] } +%struct.ham = type { %struct.zot } +%struct.zot = type { %struct.blam } +%struct.blam = type { %struct.ham.0 } +%struct.ham.0 = type { %struct.bar } +%struct.bar = type { %struct.bar.1 } +%struct.bar.1 = type { %struct.baz, i8 } +%struct.baz = type { %struct.snork } +%struct.snork = type <{ %struct.spam, i8, [3 x i8] }> +%struct.spam = type { %struct.snork.2, %struct.snork.2 } +%struct.snork.2 = type { i32 } +%struct.snork.3 = type { %struct.baz, i8, [3 x i8] } + +define void @foo(ptr %arg, ptr %arg1, i40 %arg2, ptr %arg3, i32 %arg4) #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: movl %r8d, %r14d +; CHECK-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movq %rsi, %r13 +; CHECK-NEXT: movq %rdi, %r15 +; CHECK-NEXT: incl %r14d +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: # implicit-def: $r12 +; CHECK-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_1: # %bb17 +; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: movq %r15, %r13 +; CHECK-NEXT: xorl %r15d, %r15d +; CHECK-NEXT: testq %rbx, %rbx +; CHECK-NEXT: sete %r15b +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: callq _Znwm@PLT +; CHECK-NEXT: shll $4, %r15d +; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload +; CHECK-NEXT: movq %r12, %rcx +; CHECK-NEXT: shrq $32, %rcx +; CHECK-NEXT: movb %cl, 12(%rax) +; CHECK-NEXT: movl %r12d, 8(%rax) +; CHECK-NEXT: movq %r15, %rbx +; 
CHECK-NEXT: movq %r13, %r15 +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload +; CHECK-NEXT: decl %r14d +; CHECK-NEXT: je .LBB0_8 +; CHECK-NEXT: .LBB0_3: # %bb7 +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: callq widget@PLT +; CHECK-NEXT: cmpb $-5, (%r13) +; CHECK-NEXT: jae .LBB0_5 +; CHECK-NEXT: # %bb.4: # in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: movl %r12d, %r12d +; CHECK-NEXT: cmpq %r15, %rbx +; CHECK-NEXT: jbe .LBB0_1 +; CHECK-NEXT: jmp .LBB0_7 +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB0_5: # %bb12 +; CHECK-NEXT: # in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: movq 0, %rax +; CHECK-NEXT: movq 8, %rax +; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; CHECK-NEXT: cmpq %r15, %rbx +; CHECK-NEXT: jbe .LBB0_1 +; CHECK-NEXT: .LBB0_7: # in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: decl %r14d +; CHECK-NEXT: jne .LBB0_3 +; CHECK-NEXT: .LBB0_8: # %bb21 +; CHECK-NEXT: cmpb $0, 12(%rax) +; CHECK-NEXT: jne .LBB0_10 +; CHECK-NEXT: # %bb.9: # %bb26 +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_10: # %bb25 +; CHECK-NEXT: .cfi_def_cfa %rbp, 16 +; CHECK-NEXT: movq %r15, %rdi +; CHECK-NEXT: callq pluto@PLT +bb: + br label %bb7 + +bb5: ; preds = %bb17, %bb14 + %phi = phi ptr [ %call19, %bb17 ], [ null, %bb14 ] + %phi6 = phi ptr [ %getelementptr, %bb17 ], [ null, %bb14 ] + %add = add i32 %phi9, 1 + %icmp = icmp eq i32 %phi9, %arg4 + br i1 %icmp, label %bb21, label %bb7 + +bb7: ; preds = %bb5, %bb + %phi8 = phi ptr [ null, %bb ], [ %phi6, %bb5 ] + %phi9 = phi i32 [ 0, %bb ], [ %add, %bb5 ] + %phi10 = phi i40 [ undef, %bb ], [ %phi15, %bb5 ] + %call = call ptr @widget() + %load = load i8, ptr %arg1, align 8 + %icmp11 = icmp ult i8 %load, -5 + %and = and i40 %phi10, 4294967295 + br i1 %icmp11, label %bb14, label %bb12 + +bb12: ; preds = %bb7 + %load13 = load volatile { i64, i64 }, ptr null, align 4294967296 + br label %bb14 + +bb14: ; preds = %bb12, %bb7 + %phi15 = phi i40 [ %and, %bb7 ], [ %arg2, %bb12 ] + %icmp16 = icmp ugt ptr %phi8, %arg + br i1 %icmp16, label %bb5, label %bb17 + +bb17: ; preds = %bb14 + %icmp18 = icmp eq ptr %phi8, null + %zext = zext i1 %icmp18 to i64 + %call19 = call ptr @_Znwm(i64 0) + %getelementptr = getelementptr %struct.wibble, ptr %arg3, i64 %zext + %getelementptr20 = getelementptr i8, ptr %call19, i64 8 + store i40 %phi15, ptr %getelementptr20, align 4 + br label %bb5 + +bb21: ; preds = %bb5 + %getelementptr22 = getelementptr %struct.snork.3, ptr %phi, i64 0, i32 1 + %load23 = load i8, ptr %getelementptr22, align 4 + %icmp24 = icmp eq i8 %load23, 0 + br i1 %icmp24, label %bb26, label %bb25 + +bb25: ; preds = %bb21 + call void @pluto(ptr %arg) + unreachable + +bb26: ; preds = %bb21 + ret void +} + +define void @eggs(ptr %arg, ptr %arg1) { +; CHECK-LABEL: eggs: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movq %rdi, %rax +; CHECK-NEXT: movq %rsi, %rdi +; CHECK-NEXT: movq %rax, %rsi +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: xorl %r8d, %r8d +; CHECK-NEXT: callq foo@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +bb: + call void @foo(ptr %arg1, ptr %arg, i40 0, ptr null, i32 0) + ret void +} + +declare ptr @widget() + 
+declare void @pluto(ptr) + +declare ptr @_Znwm(i64) + +attributes #0 = { noinline "frame-pointer"="all" } diff --git a/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir b/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir index 8241a1757af52..190b14052d9b6 100644 --- a/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir +++ b/llvm/test/CodeGen/X86/coalescer-implicit-def-regression-imp-operand-assert.mir @@ -9,7 +9,7 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x2aaaaaab), %bb.2(0x55555555) ; CHECK-NEXT: liveins: $edi ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: undef [[MOV32r0_:%[0-9]+]].sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def [[MOV32r0_]] ; CHECK-NEXT: JCC_1 %bb.2, 5, implicit killed undef $eflags ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: @@ -28,7 +28,7 @@ body: | ; CHECK-NEXT: JCC_1 %bb.5, 5, implicit killed undef $eflags ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: - ; CHECK-NEXT: dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def $al + ; CHECK-NEXT: dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def $al, implicit-def $al ; CHECK-NEXT: RET 0, killed undef $al ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: diff --git a/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir b/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir new file mode 100644 index 0000000000000..fe53aef86e835 --- /dev/null +++ b/llvm/test/CodeGen/X86/coalescing-subreg-to-reg-requires-subrange-update.mir @@ -0,0 +1,47 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=x86_64-- -run-pass=register-coalescer -enable-subreg-liveness -verify-coalescing -o - %s | FileCheck %s + + +# FIXME: Need to handle subrange updates when coalescing with subreg_to_reg +# This will fail if x86 enables subregister liveness. +--- +name: requires_new_subrange_coalesce_subreg_to_reg +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: requires_new_subrange_coalesce_subreg_to_reg + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %a.sub_32bit:gr64_with_sub_8bit = COPY $eax + ; CHECK-NEXT: %b:gr32 = IMPLICIT_DEF + ; CHECK-NEXT: %c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit + ; CHECK-NEXT: JCC_1 %bb.2, 4, implicit undef $eflags + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %a.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: %c.sub_32bit:gr64 = COPY %a + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: %c.sub_32bit:gr64 = SUBREG_TO_REG %a, %b, %subreg.sub_32bit + ; CHECK-NEXT: RET 0, implicit %c + bb.0: + liveins: $eax + %init_eax:gr32 = COPY $eax + %a:gr64 = SUBREG_TO_REG 0, %init_eax, %subreg.sub_32bit + %b:gr32 = IMPLICIT_DEF + %c:gr64 = INSERT_SUBREG %a, %b, %subreg.sub_32bit + JCC_1 %bb.2, 4, implicit undef $eflags + + bb.1: + %imm0:gr32 = MOV32r0 implicit-def dead $eflags + %a = SUBREG_TO_REG 0, %imm0, %subreg.sub_32bit + %c.sub_32bit = COPY %a + + bb.2: + %c.sub_32bit = SUBREG_TO_REG %a, %b, %subreg.sub_32bit + RET 0, implicit %c + +... 
diff --git a/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir b/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir new file mode 100644 index 0000000000000..6121a0bcc5641 --- /dev/null +++ b/llvm/test/CodeGen/X86/subreg-to-reg-coalescing.mir @@ -0,0 +1,348 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=x86_64-- -run-pass=register-coalescer -o - %s | FileCheck %s + +# We cannot lose the liveness of the high subregister of %1 when +# coalesced with %0, so introduce an implicit-def of the super +# register on the MOV. + +--- +name: coalesce_mov32r0_into_subreg_to_reg64 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64 + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %1 + ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi + ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %0:gr32 = MOV32r0 implicit-def dead $eflags + %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... + +--- +name: subreg_to_reg_folds_to_undef +tracksRegLiveness: true +body: | + bb.0: + liveins: $rax + + ; CHECK-LABEL: name: subreg_to_reg_folds_to_undef + ; CHECK: liveins: $rax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_with_sub_8bit = COPY $rax + ; CHECK-NEXT: undef %4.sub_32bit:gr64_with_sub_8bit = MOV32rr [[COPY]].sub_32bit, implicit-def %4 + ; CHECK-NEXT: RET 0, implicit %4 + %0:gr64 = COPY killed $rax + %1:gr32 = COPY killed %0.sub_32bit + %2:gr32 = MOV32rr killed %1 + %3:gr64 = SUBREG_TO_REG 0, killed %2, %subreg.sub_32bit + %4:gr64 = COPY killed %3 + RET 0, implicit %4 + +... 
+ +--- +name: coalesce_mov32r0_subreg_def_into_subreg_to_reg64 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_subreg_def_into_subreg_to_reg64 + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi + ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + undef %0.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags + %1:gr64 = SUBREG_TO_REG 0, killed %0.sub_32bit, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... + +--- +name: coalesce_mov32r0_into_subreg_def_with_super_def_to_reg64 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_def_with_super_def_to_reg64 + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %1 + ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi + ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + undef %0.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %0 + %1:gr64 = SUBREG_TO_REG 0, killed %0.sub_32bit, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... 
+ +--- +name: coalesce_mov32r0_into_subreg_to_reg64_already_defs_other_subreg +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_already_defs_other_subreg + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def undef %1.sub_8bit, implicit-def %1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit %1 + ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit undef $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %0:gr32 = MOV32r0 implicit-def dead $eflags, implicit-def undef %0.sub_8bit + %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + INLINEASM &"", 0, implicit %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit undef $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... + + +# Reduced realistic case which was asserting after introducing new implicit-defs +--- +name: coalesce_needs_implicit_defs +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: coalesce_needs_implicit_defs + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $rdi + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi + ; CHECK-NEXT: undef %2.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %2 + ; CHECK-NEXT: undef %3.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %10.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: TEST64rr %3, %3, implicit-def $eflags + ; CHECK-NEXT: %10.sub_8bit:gr64_with_sub_8bit = SETCCr 4, implicit killed $eflags + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi + ; CHECK-NEXT: CALL64r %2, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: [[SHL64ri:%[0-9]+]]:gr64_with_sub_8bit = SHL64ri [[SHL64ri]], 4, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64rr:%[0-9]+]]:gr64_with_sub_8bit = ADD64rr [[ADD64rr]], [[COPY]], implicit-def dead $eflags + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_with_sub_8bit = COPY [[ADD64rr]] + ; CHECK-NEXT: JMP_1 %bb.1 + bb.0: + liveins: $rdi + + %0:gr64 = COPY killed $rdi + %1:gr32 = MOV32r0 implicit-def dead $eflags + %2:gr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32bit + %3:gr64 = COPY killed %2 + + bb.1: + %4:gr64 = COPY killed %3 + %5:gr32 = MOV32r0 implicit-def dead $eflags + TEST64rr killed %4, %4, implicit-def $eflags + %6:gr8 = SETCCr 4, 
implicit killed $eflags + %7:gr32 = COPY killed %5 + %7.sub_8bit:gr32 = COPY killed %6 + %8:gr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32bit + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %9:gr64 = SUBREG_TO_REG 0, %1, %subreg.sub_32bit + $rdi = COPY %9 + CALL64r killed %9, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %10:gr64 = COPY killed %8 + %10:gr64 = SHL64ri %10, 4, implicit-def dead $eflags + %11:gr64 = COPY killed %10 + %11:gr64 = ADD64rr %11, %0, implicit-def dead $eflags + %3:gr64 = COPY killed %11 + JMP_1 %bb.1 + +... + +--- +name: coalesce_mov32r0_into_subreg_to_reg64_physreg_def +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_physreg_def + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: dead $edi = MOV32r0 implicit-def dead $eflags, implicit-def $rdi + ; CHECK-NEXT: CALL64r killed $rdi, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %0:gr32 = MOV32r0 implicit-def dead $eflags + $rdi = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + CALL64r killed $rdi, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... + +--- +name: coalesce_mov32r0_into_subreg_to_reg64_physreg_use +tracksRegLiveness: true +body: | + bb.0: + liveins: $eax + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_physreg_use + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $eax = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, $eax, %subreg.sub_32bit + ; CHECK-NEXT: $rdi = COPY [[SUBREG_TO_REG]] + ; CHECK-NEXT: CALL64r [[SUBREG_TO_REG]], csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + $eax = MOV32r0 implicit-def dead $eflags + %1:gr64 = SUBREG_TO_REG 0, killed $eax, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... 
+ +# Coalesced instruction is a copy with other implicit operands +--- +name: coalesce_copy_into_subreg_to_reg64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $eax + ; CHECK-LABEL: name: coalesce_copy_into_subreg_to_reg64 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = COPY $eax, implicit-def dead $eflags, implicit-def %1 + ; CHECK-NEXT: $rdi = COPY %1 + ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %0:gr32 = COPY $eax, implicit-def dead $eflags + %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... + +--- +name: coalesce_mov32r0_into_subreg_to_reg64_multiple_redef_value +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_multiple_redef_value + ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def %1.sub_32bit, implicit %1.sub_32bit + ; CHECK-NEXT: $rdi = COPY %1 + ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + %0:gr32 = MOV32r0 implicit-def dead $eflags + INLINEASM &"", 0, implicit-def %0, implicit %0 + %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + $rdi = COPY %1 + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + +... 
+ +--- +name: coalesce_mov32r0_into_subreg_to_reg64_def_is_block_liveout +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_def_is_block_liveout + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def undef %1.sub_32bit, implicit-def %1 + ; CHECK-NEXT: JCC_1 %bb.1, 4, implicit undef $eflags + ; CHECK-NEXT: JMP_1 %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: $rdi = COPY %1 + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: RET 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + bb.0: + INLINEASM &"", 0, implicit-def %0:gr32 + JCC_1 %bb.1, 4, implicit undef $eflags + JMP_1 %bb.2 + + bb.1: + %1:gr64 = SUBREG_TO_REG 0, killed %0, %subreg.sub_32bit + $rdi = COPY %1 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + CALL64r killed %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + RET 0 + + bb.2: + +... + +--- +name: coalesce_mov32r0_into_subreg_to_reg64_def_is_phi_def +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: coalesce_mov32r0_into_subreg_to_reg64_def_is_phi_def + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, implicit-def undef %1.sub_32bit, implicit-def %1 + ; CHECK-NEXT: JCC_1 %bb.1, 4, implicit undef $eflags + ; CHECK-NEXT: JMP_1 %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $rdi = COPY %1 + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: JMP_1 %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + bb.0: + + INLINEASM &"", 0, implicit-def %0:gr32 + JCC_1 %bb.1, 4, implicit undef $eflags + JMP_1 %bb.2 + + bb.1: + %1:gr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32bit + $rdi = COPY %1 + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + CALL64r %1, csr_64, implicit $rsp, implicit $ssp, implicit killed $rdi, implicit-def $rsp, implicit-def $ssp, implicit-def dead $rax + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + JMP_1 %bb.1 + + bb.2: + +... 
From f3fa603d7404ebca7091534bfa18fed25b099204 Mon Sep 17 00:00:00 2001 From: jeanPerier <jperier@nvidia.com> Date: Fri, 22 Dec 2023 10:59:01 +0100 Subject: [PATCH 161/342] [flang] lower ASSOCIATED for procedure pointers (#76067) This is a lot less complex than the data case, where the shape has to be accounted for, so the implementation is done inline. One corner case will not be supported correctly for now: when POINTER and TARGET point to the same internal procedure, ASSOCIATED may return false, because lowering creates a fir.embox_proc each time the address of an internal procedure is taken, so different thunks for the same internal procedure/host link may be created and compare unequal. This will be fixed in a later patch that moves creation of the internal procedure fir.embox_proc into the host so that the addresses are the same when the host link is the same. That change is required anyway to properly support the lifetime of internal procedure addresses (it should always be the lifetime of the host, even when the address is taken in an internal procedure). --- .../flang/Optimizer/Builder/HLFIRTools.h | 2 +- .../include/flang/Optimizer/Dialect/FIRType.h | 6 + .../flang/Optimizer/HLFIR/HLFIRDialect.h | 6 - flang/lib/Lower/ConvertCall.cpp | 8 +- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 27 ++++ .../Intrinsics/associated-proc-pointers.f90 | 116 ++++++++++++++++++ 6 files changed, 153 insertions(+), 12 deletions(-) create mode 100644 flang/test/Lower/Intrinsics/associated-proc-pointers.f90 diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index e7561dffb7563..fcf0eded0c7ba 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -59,7 +59,7 @@ class Entity : public mlir::Value { bool isVariable() const { return !isValue(); } bool isMutableBox() const { return hlfir::isBoxAddressType(getType()); } bool isProcedurePointer() const { - return hlfir::isBoxProcAddressType(getType()); + return fir::isBoxProcAddressType(getType()); } bool isBoxAddressOrValue() const { return hlfir::isBoxAddressOrValueType(getType()); diff --git a/flang/include/flang/Optimizer/Dialect/FIRType.h b/flang/include/flang/Optimizer/Dialect/FIRType.h index a79c67dfe6de8..ecfa9839617da 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRType.h +++ b/flang/include/flang/Optimizer/Dialect/FIRType.h @@ -436,6 +436,12 @@ inline bool isBoxAddressOrValue(mlir::Type t) { return fir::unwrapRefType(t).isa<fir::BaseBoxType>(); } +/// Is this a fir.boxproc address type? +inline bool isBoxProcAddressType(mlir::Type t) { + t = fir::dyn_cast_ptrEleTy(t); + return t && t.isa<fir::BoxProcType>(); +} + /// Return a string representation of `ty`. /// /// fir.array<10x10xf32> -> prefix_10x10xf32 diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h b/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h index e8f2848529827..aa68d0811c486 100644 --- a/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h +++ b/flang/include/flang/Optimizer/HLFIR/HLFIRDialect.h @@ -67,12 +67,6 @@ inline bool isBoxAddressType(mlir::Type type) { return type && type.isa<fir::BaseBoxType>(); } -/// Is this a fir.boxproc address type? -inline bool isBoxProcAddressType(mlir::Type type) { - type = fir::dyn_cast_ptrEleTy(type); - return type && type.isa<fir::BoxProcType>(); -} - /// Is this a fir.box or fir.class address or value type?
inline bool isBoxAddressOrValueType(mlir::Type type) { return fir::unwrapRefType(type).isa<fir::BaseBoxType>(); diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index fd726c90c07bd..57ac9d0652b31 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -887,7 +887,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( // Handle the procedure pointer actual arguments. if (actual.isProcedurePointer()) { // Procedure pointer actual to procedure pointer dummy. - if (hlfir::isBoxProcAddressType(dummyType)) + if (fir::isBoxProcAddressType(dummyType)) return PreparedDummyArgument{actual, /*cleanups=*/{}}; // Procedure pointer actual to procedure dummy. if (hlfir::isFortranProcedureValue(dummyType)) { @@ -898,7 +898,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( // NULL() actual to procedure pointer dummy if (Fortran::evaluate::IsNullProcedurePointer(expr) && - hlfir::isBoxProcAddressType(dummyType)) { + fir::isBoxProcAddressType(dummyType)) { auto boxTy{Fortran::lower::getUntypedBoxProcType(builder.getContext())}; auto tempBoxProc{builder.createTemporary(loc, boxTy)}; hlfir::Entity nullBoxProc( @@ -909,7 +909,7 @@ static PreparedDummyArgument preparePresentUserCallActualArgument( if (actual.isProcedure()) { // Procedure actual to procedure pointer dummy. - if (hlfir::isBoxProcAddressType(dummyType)) { + if (fir::isBoxProcAddressType(dummyType)) { auto tempBoxProc{builder.createTemporary(loc, actual.getType())}; builder.create<fir::StoreOp>(loc, actual, tempBoxProc); return PreparedDummyArgument{tempBoxProc, /*cleanups=*/{}}; @@ -1555,8 +1555,6 @@ genIntrinsicRefCore(Fortran::lower::PreparedActualArguments &loweredActuals, } hlfir::Entity actual = arg.value()->getActual(loc, builder); - if (actual.isProcedurePointer()) - TODO(loc, "Procedure pointer as actual argument to intrinsics."); switch (argRules.lowerAs) { case fir::LowerIntrinsicArgAs::Value: operands.emplace_back( diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index ff5dbff04360a..b6d84fb13c235 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -2134,6 +2134,33 @@ fir::ExtendedValue IntrinsicLibrary::genAssociated(mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 2); + if (fir::isBoxProcAddressType(fir::getBase(args[0]).getType())) { + mlir::Value pointerBoxProc = + builder.create<fir::LoadOp>(loc, fir::getBase(args[0])); + mlir::Value pointerTarget = + builder.create<fir::BoxAddrOp>(loc, pointerBoxProc); + if (isStaticallyAbsent(args[1])) + return builder.genIsNotNullAddr(loc, pointerTarget); + mlir::Value target = fir::getBase(args[1]); + if (fir::isBoxProcAddressType(target.getType())) + target = builder.create<fir::LoadOp>(loc, target); + if (target.getType().isa<fir::BoxProcType>()) + target = builder.create<fir::BoxAddrOp>(loc, target); + mlir::Type intPtrTy = builder.getIntPtrType(); + mlir::Value pointerInt = + builder.createConvert(loc, intPtrTy, pointerTarget); + mlir::Value targetInt = builder.createConvert(loc, intPtrTy, target); + mlir::Value sameTarget = builder.create<mlir::arith::CmpIOp>( + loc, mlir::arith::CmpIPredicate::eq, pointerInt, targetInt); + mlir::Value zero = builder.createIntegerConstant(loc, intPtrTy, 0); + mlir::Value notNull = builder.create<mlir::arith::CmpIOp>( + loc, mlir::arith::CmpIPredicate::ne, zero, pointerInt); + // The not notNull test 
covers the following two cases: + // - TARGET is a procedure that is OPTIONAL and absent at runtime. + // - TARGET is a procedure pointer that is NULL. + // In both cases, ASSOCIATED should be false if POINTER is NULL. + return builder.create<mlir::arith::AndIOp>(loc, sameTarget, notNull); + } auto *pointer = args[0].match([&](const fir::MutableBoxValue &x) { return &x; }, [&](const auto &) -> const fir::MutableBoxValue * { diff --git a/flang/test/Lower/Intrinsics/associated-proc-pointers.f90 b/flang/test/Lower/Intrinsics/associated-proc-pointers.f90 new file mode 100644 index 0000000000000..248b0aff8d286 --- /dev/null +++ b/flang/test/Lower/Intrinsics/associated-proc-pointers.f90 @@ -0,0 +1,116 @@ +! Test ASSOCIATED() with procedure pointers. +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s + +subroutine test_proc_pointer_1(p, dummy_proc) + procedure(), pointer :: p + procedure() :: dummy_proc + call takes_log(associated(p, dummy_proc)) +end subroutine +! CHECK-LABEL: func.func @_QPtest_proc_pointer_1( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.boxproc<() -> ()>) { +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_1Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.boxproc<() -> ()>> +! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_1]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]] : (() -> ()) -> i64 +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (() -> ()) -> i64 +! CHECK: %[[VAL_8:.*]] = arith.cmpi eq, %[[VAL_6]], %[[VAL_7]] : i64 +! CHECK: %[[VAL_9:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_10:.*]] = arith.cmpi ne, %[[VAL_9]], %[[VAL_6]] : i64 +! CHECK: %[[VAL_11:.*]] = arith.andi %[[VAL_8]], %[[VAL_10]] : i1 +! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i1) -> !fir.logical<4> + +subroutine test_proc_pointer_2(p, p_target) + procedure(), pointer :: p, p_target + call takes_log(associated(p, p_target)) +end subroutine +! CHECK-LABEL: func.func @_QPtest_proc_pointer_2( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) { +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_2Ep_target"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.boxproc<() -> ()>> +! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.boxproc<() -> ()>> +! CHECK: %[[VAL_7:.*]] = fir.box_addr %[[VAL_6]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (() -> ()) -> i64 +! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_7]] : (() -> ()) -> i64 +! CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_8]], %[[VAL_9]] : i64 +! CHECK: %[[VAL_11:.*]] = arith.constant 0 : i64 +! 
CHECK: %[[VAL_12:.*]] = arith.cmpi ne, %[[VAL_11]], %[[VAL_8]] : i64 +! CHECK: %[[VAL_13:.*]] = arith.andi %[[VAL_10]], %[[VAL_12]] : i1 +! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i1) -> !fir.logical<4> + +subroutine test_proc_pointer_3(p, dummy_proc) + procedure(), pointer :: p + procedure(), optional :: dummy_proc + call takes_log(associated(p, dummy_proc)) +end subroutine +! CHECK-LABEL: func.func @_QPtest_proc_pointer_3( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.boxproc<() -> ()>) { +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_3Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.boxproc<() -> ()>> +! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_1]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]] : (() -> ()) -> i64 +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (() -> ()) -> i64 +! CHECK: %[[VAL_8:.*]] = arith.cmpi eq, %[[VAL_6]], %[[VAL_7]] : i64 +! CHECK: %[[VAL_9:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_10:.*]] = arith.cmpi ne, %[[VAL_9]], %[[VAL_6]] : i64 +! CHECK: %[[VAL_11:.*]] = arith.andi %[[VAL_8]], %[[VAL_10]] : i1 +! CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i1) -> !fir.logical<4> + +subroutine test_proc_pointer_4(p) + procedure(), pointer :: p + external :: some_external + call takes_log(associated(p, some_external)) +end subroutine +! CHECK-LABEL: func.func @_QPtest_proc_pointer_4( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) { +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_4Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QPsome_external) : () -> () +! CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : (() -> ()) -> !fir.boxproc<() -> ()> +! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_1]]#1 : !fir.ref<!fir.boxproc<() -> ()>> +! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (() -> ()) -> i64 +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]] : (() -> ()) -> i64 +! CHECK: %[[VAL_9:.*]] = arith.cmpi eq, %[[VAL_7]], %[[VAL_8]] : i64 +! CHECK: %[[VAL_10:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_11:.*]] = arith.cmpi ne, %[[VAL_10]], %[[VAL_7]] : i64 +! CHECK: %[[VAL_12:.*]] = arith.andi %[[VAL_9]], %[[VAL_11]] : i1 +! CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i1) -> !fir.logical<4> + +subroutine test_proc_pointer_5(p, dummy_proc) + interface + character(10) function char_func() + end function + end interface + procedure(char_func), pointer :: p + procedure(char_func) :: dummy_proc + call takes_log(associated(p, dummy_proc)) +end subroutine +! CHECK-LABEL: func.func @_QPtest_proc_pointer_5( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>, +! CHECK-SAME: %[[VAL_1:.*]]: tuple<!fir.boxproc<() -> ()>, i64> {fir.char_proc}) { +! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_proc_pointer_5Ep"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_3:.*]] = fir.extract_value %[[VAL_1]], [0 : index] : (tuple<!fir.boxproc<() -> ()>, i64>) -> !fir.boxproc<() -> ()> +! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_5:.*]] = arith.constant 10 : i64 +! CHECK: %[[VAL_6:.*]] = fir.emboxproc %[[VAL_4]] : (() -> ()) -> !fir.boxproc<() -> ()> +! CHECK: %[[VAL_7:.*]] = fir.undefined tuple<!fir.boxproc<() -> ()>, i64> +! CHECK: %[[VAL_8:.*]] = fir.insert_value %[[VAL_7]], %[[VAL_6]], [0 : index] : (tuple<!fir.boxproc<() -> ()>, i64>, !fir.boxproc<() -> ()>) -> tuple<!fir.boxproc<() -> ()>, i64> +! CHECK: %[[VAL_9:.*]] = fir.insert_value %[[VAL_8]], %[[VAL_5]], [1 : index] : (tuple<!fir.boxproc<() -> ()>, i64>, i64) -> tuple<!fir.boxproc<() -> ()>, i64> +! CHECK: %[[VAL_10:.*]] = fir.extract_value %[[VAL_9]], [0 : index] : (tuple<!fir.boxproc<() -> ()>, i64>) -> !fir.boxproc<() -> ()> +! CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.boxproc<() -> ()>> +! CHECK: %[[VAL_12:.*]] = fir.box_addr %[[VAL_11]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_13:.*]] = fir.box_addr %[[VAL_10]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_12]] : (() -> ()) -> i64 +! CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_13]] : (() -> ()) -> i64 +! CHECK: %[[VAL_16:.*]] = arith.cmpi eq, %[[VAL_14]], %[[VAL_15]] : i64 +! CHECK: %[[VAL_17:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_18:.*]] = arith.cmpi ne, %[[VAL_17]], %[[VAL_14]] : i64 +! CHECK: %[[VAL_19:.*]] = arith.andi %[[VAL_16]], %[[VAL_18]] : i1 +! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i1) -> !fir.logical<4> From 30a1c0aa27944e52f6e51fe12abc91f62e7b7eac Mon Sep 17 00:00:00 2001 From: jeanPerier <jperier@nvidia.com> Date: Fri, 22 Dec 2023 10:59:59 +0100 Subject: [PATCH 162/342] [flang] c_funloc - handle procedure pointers in convertToBox (#76070) C_FUNLOC was not handling procedure pointer arguments correctly; the issue lay in `hlfir::convertToBox`, which did not handle procedure pointers. Along the way, I modified the interface of `hlfir::convertToXXX` to take entities by value, because hlfir::Entity is fundamentally an mlir::Value with type guarantees, so entities should be dealt with by value just as mlir::Value is (they are very small).
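To make the by-value rationale concrete, here is a minimal, self-contained C++ sketch; it uses stand-in types rather than the real MLIR/flang classes, and the names `Value`, `Entity`, `derefPointer`, and `convertToBoxSketch` are illustrative only, not part of this patch:

```cpp
// Stand-ins for mlir::Value and hlfir::Entity: a single pointer-sized handle,
// so copying it costs no more than passing a reference.
struct Value { void *impl = nullptr; };
struct Entity : Value {};

// Placeholder for the "dereference the procedure pointer" step.
Entity derefPointer(Entity e) { return e; }

// With a by-value parameter the callee may rebind the entity (for example to
// its dereferenced pointee) without affecting the caller.
void convertToBoxSketch(Entity entity) {
  entity = derefPointer(entity);
  // ... translate the (possibly dereferenced) entity to a box ...
}

int main() { convertToBoxSketch(Entity{}); }
```

The actual change below simply switches the `const hlfir::Entity &` parameters of the `convertToXXX` helpers to `hlfir::Entity`.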
--- .../flang/Optimizer/Builder/HLFIRTools.h | 6 +-- flang/lib/Optimizer/Builder/HLFIRTools.cpp | 11 ++++-- .../Intrinsics/c_funloc-proc-pointers.f90 | 38 +++++++++++++++++++ 3 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90 diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h index fcf0eded0c7ba..46dc79f41a18b 100644 --- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h +++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h @@ -404,15 +404,15 @@ mlir::Value inlineElementalOp( std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>> convertToValue(mlir::Location loc, fir::FirOpBuilder &builder, - const hlfir::Entity &entity); + hlfir::Entity entity); std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>> convertToAddress(mlir::Location loc, fir::FirOpBuilder &builder, - const hlfir::Entity &entity, mlir::Type targetType); + hlfir::Entity entity, mlir::Type targetType); std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>> convertToBox(mlir::Location loc, fir::FirOpBuilder &builder, - const hlfir::Entity &entity, mlir::Type targetType); + hlfir::Entity entity, mlir::Type targetType); /// Clone an hlfir.elemental_addr into an hlfir.elemental value. hlfir::ElementalOp cloneToElementalOp(mlir::Location loc, diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index 17efa45b8667d..94f723b4bae70 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -935,7 +935,7 @@ hlfir::translateToExtendedValue(mlir::Location loc, fir::FirOpBuilder &builder, std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>> hlfir::convertToValue(mlir::Location loc, fir::FirOpBuilder &builder, - const hlfir::Entity &entity) { + hlfir::Entity entity) { // Load scalar references to integer, logical, real, or complex value // to an mlir value, dereference allocatable and pointers, and get rid // of fir.box that are not needed or create a copy into contiguous memory. @@ -957,7 +957,12 @@ static fir::ExtendedValue placeTrivialInMemory(mlir::Location loc, std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>> hlfir::convertToBox(mlir::Location loc, fir::FirOpBuilder &builder, - const hlfir::Entity &entity, mlir::Type targetType) { + hlfir::Entity entity, mlir::Type targetType) { + // fir::factory::createBoxValue is not meant to deal with procedures. + // Dereference procedure pointers here. + if (entity.isProcedurePointer()) + entity = hlfir::derefPointersAndAllocatables(loc, builder, entity); + auto [exv, cleanup] = translateToExtendedValue(loc, builder, entity); // Procedure entities should not go through createBoxValue that embox // object entities. Return the fir.boxproc directly. 
@@ -972,7 +977,7 @@ hlfir::convertToBox(mlir::Location loc, fir::FirOpBuilder &builder, std::pair<fir::ExtendedValue, std::optional<hlfir::CleanupFunction>> hlfir::convertToAddress(mlir::Location loc, fir::FirOpBuilder &builder, - const hlfir::Entity &entity, mlir::Type targetType) { + hlfir::Entity entity, mlir::Type targetType) { hlfir::Entity derefedEntity = hlfir::derefPointersAndAllocatables(loc, builder, entity); auto [exv, cleanup] = diff --git a/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90 b/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90 new file mode 100644 index 0000000000000..c9578b17ac525 --- /dev/null +++ b/flang/test/Lower/Intrinsics/c_funloc-proc-pointers.f90 @@ -0,0 +1,38 @@ +! Test C_FUNLOC() with procedure pointers. +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s + +subroutine test_c_funloc(p) + use iso_c_binding, only : c_funloc + real, pointer, external :: p + call test(c_funloc(p)) +end subroutine +! CHECK-LABEL: func.func @_QPtest_c_funloc( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) { +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEp"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>> +! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}> +! CHECK: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}> +! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.field) -> !fir.ref<i64> +! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_2]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (() -> ()) -> i64 +! CHECK: fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<i64> + +subroutine test_c_funloc_char(p) + use iso_c_binding, only : c_funloc + interface + character(10) function char_func() + end function + end interface + procedure(char_func), pointer :: p + call test(c_funloc(p)) +end subroutine +! CHECK-LABEL: func.func @_QPtest_c_funloc_char( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>) { +! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEp"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<() -> ()>> +! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}> +! CHECK: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}> +! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.field) -> !fir.ref<i64> +! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_2]] : (!fir.boxproc<() -> ()>) -> (() -> ()) +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (() -> ()) -> i64 +! 
CHECK: fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<i64> From 0ac1dfa31153f3dcb2fcda63e7f198f5682193eb Mon Sep 17 00:00:00 2001 From: jeanPerier <jperier@nvidia.com> Date: Fri, 22 Dec 2023 11:01:03 +0100 Subject: [PATCH 163/342] [flang] lower c_f_procpointer (#76071) This is equivalent to a procedure pointer assignment, except that the target is a C_FUNPTR. --- .../flang/Optimizer/Builder/IntrinsicCall.h | 1 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 20 +++++++++ .../test/Lower/Intrinsics/c_f_procpointer.f90 | 42 +++++++++++++++++++ 3 files changed, 63 insertions(+) create mode 100644 flang/test/Lower/Intrinsics/c_f_procpointer.f90 diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index ba0c4806c759e..dba946975e192 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -202,6 +202,7 @@ struct IntrinsicLibrary { fir::ExtendedValue genCAssociatedCPtr(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genCFPointer(llvm::ArrayRef<fir::ExtendedValue>); + void genCFProcPointer(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCFunLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genDateAndTime(llvm::ArrayRef<fir::ExtendedValue>); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index b6d84fb13c235..c8057fbdd475a 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -165,6 +165,10 @@ static constexpr IntrinsicHandler handlers[]{ {"fptr", asInquired}, {"shape", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, + {"c_f_procpointer", + &I::genCFProcPointer, + {{{"cptr", asValue}, {"fptr", asInquired}}}, + /*isElemental=*/false}, {"c_funloc", &I::genCFunLoc, {{{"x", asBox}}}, /*isElemental=*/false}, {"c_loc", &I::genCLoc, {{{"x", asBox}}}, /*isElemental=*/false}, {"ceiling", &I::genCeiling}, @@ -2525,6 +2529,22 @@ void IntrinsicLibrary::genCFPointer(llvm::ArrayRef<fir::ExtendedValue> args) { /*lbounds=*/mlir::ValueRange{}); } +// C_F_PROCPOINTER +void IntrinsicLibrary::genCFProcPointer( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value cptr = + fir::factory::genCPtrOrCFunptrValue(builder, loc, fir::getBase(args[0])); + mlir::Value fptr = fir::getBase(args[1]); + auto boxProcType = + mlir::cast<fir::BoxProcType>(fir::unwrapRefType(fptr.getType())); + mlir::Value cptrCast = + builder.createConvert(loc, boxProcType.getEleTy(), cptr); + mlir::Value cptrBox = + builder.create<fir::EmboxProcOp>(loc, boxProcType, cptrCast); + builder.create<fir::StoreOp>(loc, cptrBox, fptr); +} + // C_FUNLOC fir::ExtendedValue IntrinsicLibrary::genCFunLoc(mlir::Type resultType, diff --git a/flang/test/Lower/Intrinsics/c_f_procpointer.f90 b/flang/test/Lower/Intrinsics/c_f_procpointer.f90 new file mode 100644 index 0000000000000..f70a56c91b916 --- /dev/null +++ b/flang/test/Lower/Intrinsics/c_f_procpointer.f90 @@ -0,0 +1,42 @@ +! Test C_F_PROCPOINTER() lowering. +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s + +subroutine test_c_funloc(fptr, cptr) + use iso_c_binding, only : c_f_procpointer, c_funptr + real, pointer, external :: fptr + type(c_funptr), cptr + call c_f_procpointer(cptr, fptr) +end subroutine +! CHECK-LABEL: func.func @_QPtest_c_funloc( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>, +! 
CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>> {fir.bindc_name = "cptr"}) { +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_c_funlocEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funlocEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}> +! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_2]]#1, %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.field) -> !fir.ref<i64> +! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64> +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> (() -> ()) +! CHECK: %[[VAL_8:.*]] = fir.emboxproc %[[VAL_7]] : (() -> ()) -> !fir.boxproc<() -> ()> +! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]]#1 : !fir.ref<!fir.boxproc<() -> ()>> + +subroutine test_c_funloc_char(fptr, cptr) + use iso_c_binding, only : c_f_procpointer, c_funptr + interface + character(10) function char_func() + end function + end interface + procedure(char_func), pointer :: fptr + type(c_funptr), cptr + call c_f_procpointer(cptr, fptr) +end subroutine +! CHECK-LABEL: func.func @_QPtest_c_funloc_char( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.boxproc<() -> ()>>, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>> {fir.bindc_name = "cptr"}) { +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest_c_funloc_charEcptr"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>) +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_c_funloc_charEfptr"} : (!fir.ref<!fir.boxproc<() -> ()>>) -> (!fir.ref<!fir.boxproc<() -> ()>>, !fir.ref<!fir.boxproc<() -> ()>>) +! CHECK: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}> +! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_2]]#1, %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>>, !fir.field) -> !fir.ref<i64> +! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]] : !fir.ref<i64> +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i64) -> (() -> ()) +! CHECK: %[[VAL_8:.*]] = fir.emboxproc %[[VAL_7]] : (() -> ()) -> !fir.boxproc<() -> ()> +! CHECK: fir.store %[[VAL_8]] to %[[VAL_3]]#1 : !fir.ref<!fir.boxproc<() -> ()>> From 7109a462cd7335f1427683139b958634eb883b9e Mon Sep 17 00:00:00 2001 From: Lucas Duarte Prates <lucas.prates@arm.com> Date: Fri, 22 Dec 2023 10:06:06 +0000 Subject: [PATCH 164/342] [AArch64] Assembly support for the Armv9.5-A RAS Extensions (#76161) This implements assembly support for the RAS extensions introduced as part of the Armv9.5-A architecture version. 
The changes include: * New system registers for Delegated SError exceptions for EL3 (FEAT_E3DSE): * VDISR_EL3 * VSESR_EL3 More details about these extensions can be found at: * https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-a-profile-architecture-developments-2023 * https://developer.arm.com/documentation/ddi0602/2023-09/ Co-authored-by: Jirui Wu <jirui.wu@arm.com> Co-authored-by: Oliver Stannard <oliver.stannard@arm.com> --- llvm/lib/Target/AArch64/AArch64SystemOperands.td | 5 +++++ llvm/test/MC/AArch64/armv9.5a-e3dse.s | 13 +++++++++++++ .../test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt | 13 +++++++++++++ 3 files changed, 31 insertions(+) create mode 100644 llvm/test/MC/AArch64/armv9.5a-e3dse.s create mode 100644 llvm/test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index e8b5f6059c9ee..28a5776a3089c 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -1946,3 +1946,8 @@ def : RWSysReg<"MDSTEPOP_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b010>; // v9.5a System PMU zero register (FEAT_SPMU2) // Op0 Op1 CRn CRm Op2 def : WOSysReg<"SPMZR_EL0", 0b10, 0b011, 0b1001, 0b1100, 0b100>; + +// v9.5a Delegated SError exceptions for EL3 (FEAT_E3DSE) +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"VDISR_EL3", 0b11, 0b110, 0b1100, 0b0001, 0b001>; +def : RWSysReg<"VSESR_EL3", 0b11, 0b110, 0b0101, 0b0010, 0b011>; diff --git a/llvm/test/MC/AArch64/armv9.5a-e3dse.s b/llvm/test/MC/AArch64/armv9.5a-e3dse.s new file mode 100644 index 0000000000000..b69d49ab4e9e5 --- /dev/null +++ b/llvm/test/MC/AArch64/armv9.5a-e3dse.s @@ -0,0 +1,13 @@ +// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s + +mrs x0, VDISR_EL3 +// CHECK: mrs x0, VDISR_EL3 // encoding: [0x20,0xc1,0x3e,0xd5] + +msr VDISR_EL3, x0 +// CHECK: msr VDISR_EL3, x0 // encoding: [0x20,0xc1,0x1e,0xd5] + +mrs x0, VSESR_EL3 +// CHECK: mrs x0, VSESR_EL3 // encoding: [0x60,0x52,0x3e,0xd5] + +msr VSESR_EL3, x0 +// CHECK: msr VSESR_EL3, x0 // encoding: [0x60,0x52,0x1e,0xd5] diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt new file mode 100644 index 0000000000000..d2476dbf876d4 --- /dev/null +++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-e3dse.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s + +[0x20,0xc1,0x3e,0xd5] +# CHECK: mrs x0, VDISR_EL3 + +[0x20,0xc1,0x1e,0xd5] +# CHECK: msr VDISR_EL3, x0 + +[0x60,0x52,0x3e,0xd5] +# CHECK: mrs x0, VSESR_EL3 + +[0x60,0x52,0x1e,0xd5] +# CHECK: msr VSESR_EL3, x0 From 625197d39cf9d56a295f8e6ee2584c825b461db9 Mon Sep 17 00:00:00 2001 From: Qizhi Hu <836744285@qq.com> Date: Fri, 22 Dec 2023 18:06:59 +0800 Subject: [PATCH 165/342] [clang][ASTImporter] Support Importer of BuiltinBitCastExpr (#74813) Since the import of `ExplicitCastExpr` lacked handling for the `BuiltinBitCastExprClass` type, it would reach the 'unreachable' code and crash. This patch fixes the [crash](https://github.com/llvm/llvm-project/issues/74774) by adding handling for `BuiltinBitCastExpr`.
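For reference, a minimal snippet whose AST contains a `BuiltinBitCastExpr` is sketched below. It is a hypothetical reproducer that mirrors the unit test added by this patch and assumes a target where `int` and `float` have the same size, as `__builtin_bit_cast` requires:

```cpp
// Compile as C++20 (e.g. clang++ -std=c++20 -c repro.cpp). Importing an AST
// that contains this cast previously reached the unreachable default case in
// ASTNodeImporter::VisitExplicitCastExpr and crashed.
float bits_to_float(int x) {
  return __builtin_bit_cast(float, x); // BuiltinBitCastExpr node
}
```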
Co-authored-by: huqizhi <836744285@qq.com> --- clang/lib/AST/ASTImporter.cpp | 12 ++++++++++++ clang/unittests/AST/ASTImporterTest.cpp | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 88b8c6abb6d5f..949310856562c 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -7826,6 +7826,18 @@ ExpectedStmt ASTNodeImporter::VisitExplicitCastExpr(ExplicitCastExpr *E) { *ToLParenLocOrErr, OCE->getBridgeKind(), E->getCastKind(), *ToBridgeKeywordLocOrErr, ToTypeInfoAsWritten, ToSubExpr); } + case Stmt::BuiltinBitCastExprClass: { + auto *BBC = cast<BuiltinBitCastExpr>(E); + ExpectedSLoc ToKWLocOrErr = import(BBC->getBeginLoc()); + if (!ToKWLocOrErr) + return ToKWLocOrErr.takeError(); + ExpectedSLoc ToRParenLocOrErr = import(BBC->getEndLoc()); + if (!ToRParenLocOrErr) + return ToRParenLocOrErr.takeError(); + return new (Importer.getToContext()) BuiltinBitCastExpr( + ToType, E->getValueKind(), E->getCastKind(), ToSubExpr, + ToTypeInfoAsWritten, *ToKWLocOrErr, *ToRParenLocOrErr); + } default: llvm_unreachable("Cast expression of unsupported type!"); return make_error<ASTImportError>(ASTImportError::UnsupportedConstruct); diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 9fa7660cde659..6c7b2b64ca2d1 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -561,6 +561,18 @@ TEST_P(ImportExpr, ImportVAArgExpr) { cStyleCastExpr(hasSourceExpression(vaArgExpr()))))); } +const internal::VariadicDynCastAllOfMatcher<Stmt, BuiltinBitCastExpr> + builtinBitCastExpr; + +TEST_P(ImportExpr, ImportBuiltinBitCastExpr) { + MatchVerifier<Decl> Verifier; + testImport("void declToImport(int X) {" + " (void)__builtin_bit_cast(float, X); }", + Lang_CXX20, "", Lang_CXX20, Verifier, + functionDecl(hasDescendant( + cStyleCastExpr(hasSourceExpression(builtinBitCastExpr()))))); +} + TEST_P(ImportExpr, CXXTemporaryObjectExpr) { MatchVerifier<Decl> Verifier; testImport( From f7c3627338a2b98ccfac0312d47bb06d5a189c29 Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Fri, 22 Dec 2023 17:15:52 +0700 Subject: [PATCH 166/342] DAG: Implement promotion for strict_fpextend (#74310) Test is a placeholder, will be merged into the existing test after additional bug fixes for illegal f16 targets are fixed. 
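For context, a hedged source-level sketch of the operation this legalizes: assuming a target with `_Float16` support and strict floating-point semantics (for example, compiling with `-ffp-exception-behavior=strict`), the widening conversion below is emitted as `llvm.experimental.constrained.fpext`, which the new promotion hook handles on targets where `f16` is not a legal type:

```cpp
// Hypothetical example, not part of the patch: a strict-FP widening
// conversion from half to float.
float widen_strict(_Float16 x) {
  return static_cast<float>(x); // becomes a constrained fpext under strict FP
}
```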
--- .../SelectionDAG/LegalizeFloatTypes.cpp | 23 ++++++++++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll | 43 +++++++++++++++++++ 3 files changed, 67 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index c4605a6b9598a..65919a64b8065 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2214,6 +2214,9 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) { case ISD::FP_TO_UINT_SAT: R = PromoteFloatOp_FP_TO_XINT_SAT(N, OpNo); break; case ISD::FP_EXTEND: R = PromoteFloatOp_FP_EXTEND(N, OpNo); break; + case ISD::STRICT_FP_EXTEND: + R = PromoteFloatOp_STRICT_FP_EXTEND(N, OpNo); + break; case ISD::SELECT_CC: R = PromoteFloatOp_SELECT_CC(N, OpNo); break; case ISD::SETCC: R = PromoteFloatOp_SETCC(N, OpNo); break; case ISD::STORE: R = PromoteFloatOp_STORE(N, OpNo); break; @@ -2276,6 +2279,26 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo) { return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Op); } +SDValue DAGTypeLegalizer::PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N, + unsigned OpNo) { + assert(OpNo == 1); + + SDValue Op = GetPromotedFloat(N->getOperand(1)); + EVT VT = N->getValueType(0); + + // Desired VT is same as promoted type. Use promoted float directly. + if (VT == Op->getValueType(0)) { + ReplaceValueWith(SDValue(N, 1), N->getOperand(0)); + return Op; + } + + // Else, extend the promoted float value to the desired VT. + SDValue Res = DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(N), N->getVTList(), + N->getOperand(0), Op); + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + // Promote the float operands used for comparison. 
The true- and false- // operands have the same type as the result and are promoted, if needed, by // PromoteFloatRes_SELECT_CC diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 4a249e7a2dc92..84b1b2c71fd0b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -713,6 +713,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo); + SDValue PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_UnaryOp(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_FP_TO_XINT_SAT(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_STORE(SDNode *N, unsigned OpNo); diff --git a/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll new file mode 100644 index 0000000000000..0339fca4d56cf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/strict_fp_casts.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX7 %s + +declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0 +declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0 + +define float @v_constrained_fpext_f16_to_f32(ptr addrspace(1) %ptr) #0 { +; GFX7-LABEL: v_constrained_fpext_f16_to_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = load half, ptr addrspace(1) %ptr + %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %val, metadata !"fpexcept.strict") + ret float %result +} + +define <2 x float> @v_constrained_fpext_v2f16_to_v2f32(ptr addrspace(1) %ptr) #0 { +; GFX7-LABEL: v_constrained_fpext_v2f16_to_v2f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s4, s6 +; GFX7-NEXT: s_mov_b32 s5, s6 +; GFX7-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = load <2 x half>, ptr addrspace(1) %ptr + %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %val, metadata !"fpexcept.strict") + ret <2 x float> %result +} + +attributes #0 = { strictfp } From 24e80d4cc5778fe4869badd34acbe1e7ded811d6 Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Fri, 22 Dec 2023 11:28:11 +0100 Subject: [PATCH 167/342] [IndVars] Move "using namespace" to top-level scope (NFC) --- llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 722ed03db3de3..2f2fa3730c894 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp 
@@ -27,6 +27,7 @@ #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "indvars" @@ -786,8 +787,6 @@ bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO, /// otherwise. bool SimplifyIndvar::strengthenRightShift(BinaryOperator *BO, Instruction *IVOperand) { - using namespace llvm::PatternMatch; - if (BO->getOpcode() == Instruction::Shl) { bool Changed = false; ConstantRange IVRange = SE->getUnsignedRange(SE->getSCEV(IVOperand)); @@ -2011,8 +2010,6 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) { /// by looking at dominating conditions inside of the loop void WidenIV::calculatePostIncRange(Instruction *NarrowDef, Instruction *NarrowUser) { - using namespace llvm::PatternMatch; - Value *NarrowDefLHS; const APInt *NarrowDefRHS; if (!match(NarrowDef, m_NSWAdd(m_Value(NarrowDefLHS), From c16559137cd1ad08ef5934558829d2ff5d227d70 Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Fri, 22 Dec 2023 11:17:39 +0100 Subject: [PATCH 168/342] [IndVars] Avoid unnecessary truncate for zext nneg use When performing sext IV widening, if one of the narrow uses is in a zext nneg, we can treat it like an sext and avoid the insertion of a trunc. --- llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 2 +- llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 2f2fa3730c894..42e7c4006b427 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -1762,7 +1762,7 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, SCEVExpander &Rewri }; // Our raison d'etre! Eliminate sign and zero extension. 
- if ((isa<SExtInst>(DU.NarrowUse) && canWidenBySExt()) || + if ((match(DU.NarrowUse, m_SExtLike(m_Value())) && canWidenBySExt()) || (isa<ZExtInst>(DU.NarrowUse) && canWidenByZExt())) { Value *NewDef = DU.WideDef; if (DU.NarrowUse->getType() != WideType) { diff --git a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll index 612e9452c6373..739db26311f4a 100644 --- a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll +++ b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll @@ -331,9 +331,7 @@ define void @zext_nneg_add_nsw(ptr %A, i32 %offset, i32 %M) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 -; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext nneg i32 [[TMP2]] to i64 -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] @@ -658,9 +656,7 @@ define void @zext_nneg_mul_nsw(ptr %A, i32 %multiple, i32 %M) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 -; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext nneg i32 [[TMP2]] to i64 -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] From ffabf7355302b4c506e9a9534ef8f78c1a9e94e7 Mon Sep 17 00:00:00 2001 From: Dominik Adamski <dominik.adamski@amd.com> Date: Fri, 22 Dec 2023 11:58:04 +0100 Subject: [PATCH 169/342] [NFC][OpenMP][MLIR] Add test for lowering parallel workshare GPU loop (#76144) This test checks if MLIR code is lowered according to schema presented below: func1() { call __kmpc_parallel_51(..., func2, ...) } func2() { call __kmpc_for_static_loop_4u(..., func3, ...) 
} func3() { //loop body } --- .../LLVMIR/omptarget-parallel-wsloop.mlir | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir new file mode 100644 index 0000000000000..43d0934d3a931 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir @@ -0,0 +1,36 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// The aim of the test is to check the GPU LLVM IR codegen +// for nested omp do loop inside omp target region + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } { + llvm.func @target_parallel_wsloop(%arg0: !llvm.ptr ){ + omp.parallel { + %loop_ub = llvm.mlir.constant(9 : i32) : i32 + %loop_lb = llvm.mlir.constant(0 : i32) : i32 + %loop_step = llvm.mlir.constant(1 : i32) : i32 + omp.wsloop for (%loop_cnt) : i32 = (%loop_lb) to (%loop_ub) inclusive step (%loop_step) { + %gep = llvm.getelementptr %arg0[0, %loop_cnt] : (!llvm.ptr, i32) -> !llvm.ptr, !llvm.array<10 x i32> + llvm.store %loop_cnt, %gep : i32, !llvm.ptr + omp.yield + } + omp.terminator + } + + llvm.return + } + +} +// CHECK: call void @__kmpc_parallel_51(ptr addrspacecast +// CHECK-SAME: (ptr addrspace(1) @[[GLOB:[0-9]+]] to ptr), +// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 1, i32 -1, i32 -1, +// CHECK-SAME: ptr @[[PARALLEL_FUNC:.*]], ptr null, ptr %[[PARALLEL_ARGS:.*]], i64 1) + +// CHECK: define internal void @[[PARALLEL_FUNC]] +// CHECK-SAME: (ptr noalias noundef %[[TID_ADDR:.*]], ptr noalias noundef %[[ZERO_ADDR:.*]], +// CHECK-SAME: ptr %[[ARG_PTR:.*]]) +// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB]] to ptr), +// CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOO_BODY_FUNC_ARG:.*]], i32 10, +// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 0) + +// CHECK: define internal void @[[LOOP_BODY_FUNC]](i32 %[[CNT:.*]], ptr %[[LOOP_BODY_ARG_PTR:.*]]) { From 95b423e44f6f35651bb1e7d4f6e0591df71360af Mon Sep 17 00:00:00 2001 From: Ilya Biryukov <ibiryukov@google.com> Date: Fri, 22 Dec 2023 11:47:20 +0100 Subject: [PATCH 170/342] [Sema] NFC. 
Simplify code in a few places of TryOrBuildParenListInitialization --- clang/lib/Sema/SemaInit.cpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 0fbd87ce34db9..d5ba7fd341371 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -25,6 +25,7 @@ #include "clang/Sema/EnterExpressionEvaluationContext.h" #include "clang/Sema/Initialization.h" #include "clang/Sema/Lookup.h" +#include "clang/Sema/Ownership.h" #include "clang/Sema/SemaInternal.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/FoldingSet.h" @@ -5429,18 +5430,12 @@ static void TryOrBuildParenListInitialization( auto HandleInitializedEntity = [&](const InitializedEntity &SubEntity, const InitializationKind &SubKind, Expr *Arg, Expr **InitExpr = nullptr) { - InitializationSequence IS = [&]() { - if (Arg) - return InitializationSequence(S, SubEntity, SubKind, Arg); - return InitializationSequence(S, SubEntity, SubKind, std::nullopt); - }(); + InitializationSequence IS = InitializationSequence( + S, SubEntity, SubKind, Arg ? MultiExprArg(Arg) : std::nullopt); if (IS.Failed()) { if (!VerifyOnly) { - if (Arg) - IS.Diagnose(S, SubEntity, SubKind, Arg); - else - IS.Diagnose(S, SubEntity, SubKind, std::nullopt); + IS.Diagnose(S, SubEntity, SubKind, Arg ? ArrayRef(Arg) : std::nullopt); } else { Sequence.SetFailed( InitializationSequence::FK_ParenthesizedListInitFailed); @@ -5450,10 +5445,8 @@ static void TryOrBuildParenListInitialization( } if (!VerifyOnly) { ExprResult ER; - if (Arg) - ER = IS.Perform(S, SubEntity, SubKind, Arg); - else - ER = IS.Perform(S, SubEntity, SubKind, std::nullopt); + ER = IS.Perform(S, SubEntity, SubKind, + Arg ? MultiExprArg(Arg) : std::nullopt); if (InitExpr) *InitExpr = ER.get(); else From 86dc6e15f22610bbb53eb4efda0a178ecefc933a Mon Sep 17 00:00:00 2001 From: Ilya Biryukov <ibiryukov@google.com> Date: Fri, 22 Dec 2023 13:11:27 +0100 Subject: [PATCH 171/342] [Sema] Fix crash on invalid code with parenthesized aggregate initialization (#76232) Fixes #76228. Use the same logic as braced init lists, also adds a test that puts incomplete types in various positions to check for regressions in the future. --- clang/lib/Sema/SemaInit.cpp | 8 ++++++++ clang/test/SemaCXX/crash-GH76228.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 clang/test/SemaCXX/crash-GH76228.cpp diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index d5ba7fd341371..f768d2726b0a1 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -5512,6 +5512,14 @@ static void TryOrBuildParenListInitialization( } else if (auto *RT = Entity.getType()->getAs<RecordType>()) { bool IsUnion = RT->isUnionType(); const CXXRecordDecl *RD = cast<CXXRecordDecl>(RT->getDecl()); + if (RD->isInvalidDecl()) { + // Exit early to avoid confusion when processing members. + // We do the same for braced list initialization in + // `CheckStructUnionTypes`. 
+ Sequence.SetFailed( + clang::InitializationSequence::FK_ParenthesizedListInitFailed); + return; + } if (!IsUnion) { for (const CXXBaseSpecifier &Base : RD->bases()) { diff --git a/clang/test/SemaCXX/crash-GH76228.cpp b/clang/test/SemaCXX/crash-GH76228.cpp new file mode 100644 index 0000000000000..33a9395823127 --- /dev/null +++ b/clang/test/SemaCXX/crash-GH76228.cpp @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 -std=c++20 -verify %s +// Check we don't crash on incomplete members and bases when handling parenthesized initialization. +class incomplete; // expected-note@-0 3 {{forward declaration of 'incomplete'}} +struct foo { + int a; + incomplete b; + // expected-error@-1 {{incomplete type}} +}; +foo a1(0); + +struct one_int { + int a; +}; +struct bar : one_int, incomplete {}; +// expected-error@-1 {{incomplete type}} +bar a2(0); + +incomplete a3[3](1,2,3); +// expected-error@-1 {{incomplete type}} + +struct qux : foo { +}; +qux a4(0); + +struct fred { + foo a[3]; +}; +fred a5(0); From 7ab16fb5207fe187ab999f882069bd632d2e68e5 Mon Sep 17 00:00:00 2001 From: Ilya Biryukov <ibiryukov@google.com> Date: Fri, 22 Dec 2023 13:30:43 +0100 Subject: [PATCH 172/342] [Sema] Update test for previous change The warning for C++20 extension does not fire in on specific instance because conversion now fails as class is invalid because of an invalid member. The new behavior is expected, so updating the test accordingly --- clang/test/SemaCXX/paren-list-agg-init.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/SemaCXX/paren-list-agg-init.cpp b/clang/test/SemaCXX/paren-list-agg-init.cpp index f60b20e0d4656..c1964a5a9eb00 100644 --- a/clang/test/SemaCXX/paren-list-agg-init.cpp +++ b/clang/test/SemaCXX/paren-list-agg-init.cpp @@ -289,7 +289,7 @@ int test() { // used to crash S a(0, 1); S b(0); - S c(0, 0, 1); // beforecxx20-warning {{aggregate initialization of type 'S' from a parenthesized list of values is a C++20 extension}} + S c(0, 0, 1); S d {0, 1}; S e {0}; From d430c145ba92328e8363fab7adca4fc1e61e6637 Mon Sep 17 00:00:00 2001 From: Abhina Sree <69635948+abhina-sree@users.noreply.github.com> Date: Fri, 22 Dec 2023 08:12:19 -0500 Subject: [PATCH 173/342] [CMake] Move check for dlfcn.h and dladdr to clang (#76163) This patch checks for the presence of dlfcn.h and dladdr in clang to be used in clang/tools/libclang/CIndexer.cpp --- clang/CMakeLists.txt | 17 +++++++++++++++++ clang/include/clang/Config/config.h.cmake | 6 ++++++ clang/tools/libclang/CIndexer.cpp | 4 ++-- llvm/include/llvm/Config/config.h.cmake | 6 ++++++ llvm/include/llvm/Config/llvm-config.h.cmake | 6 ------ 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 2ca6db02e5879..9f814478c4550 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -167,6 +167,23 @@ endif() include(CheckIncludeFile) check_include_file(sys/resource.h CLANG_HAVE_RLIMITS) +# This check requires _GNU_SOURCE on linux +check_include_file(dlfcn.h CLANG_HAVE_DLFCN_H) +if( CLANG_HAVE_DLFCN_H ) + include(CheckLibraryExists) + include(CheckSymbolExists) + check_library_exists(dl dlopen "" HAVE_LIBDL) + if( HAVE_LIBDL ) + list(APPEND CMAKE_REQUIRED_LIBRARIES dl) + endif() + list(APPEND CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE) + check_symbol_exists(dladdr dlfcn.h CLANG_HAVE_DLADDR) + list(REMOVE_ITEM CMAKE_REQUIRED_DEFINITIONS -D_GNU_SOURCE) + if( HAVE_LIBDL ) + list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES dl) + endif() +endif() + set(CLANG_RESOURCE_DIR "" CACHE STRING "Relative directory 
from the Clang binary to its resource files.") diff --git a/clang/include/clang/Config/config.h.cmake b/clang/include/clang/Config/config.h.cmake index a54a26cd32ffe..4015ac8040861 100644 --- a/clang/include/clang/Config/config.h.cmake +++ b/clang/include/clang/Config/config.h.cmake @@ -57,6 +57,12 @@ /* Define if we have sys/resource.h (rlimits) */ #cmakedefine CLANG_HAVE_RLIMITS ${CLANG_HAVE_RLIMITS} +/* Define if we have dlfcn.h */ +#cmakedefine CLANG_HAVE_DLFCN_H ${CLANG_HAVE_DLFCN_H} + +/* Define if dladdr() is available on this platform. */ +#cmakedefine CLANG_HAVE_DLADDR ${CLANG_HAVE_DLADDR} + /* Linker version detected at compile time. */ #cmakedefine HOST_LINK_VERSION "${HOST_LINK_VERSION}" diff --git a/clang/tools/libclang/CIndexer.cpp b/clang/tools/libclang/CIndexer.cpp index 430147b2aa77a..12d9d418dea51 100644 --- a/clang/tools/libclang/CIndexer.cpp +++ b/clang/tools/libclang/CIndexer.cpp @@ -14,10 +14,10 @@ #include "CXString.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/Version.h" +#include "clang/Config/config.h" #include "clang/Driver/Driver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" -#include "llvm/Config/llvm-config.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MD5.h" #include "llvm/Support/Path.h" @@ -127,7 +127,7 @@ const std::string &CIndexer::getClangResourcesPath() { getClangResourcesPathImplAIX(LibClangPath); #else bool PathFound = false; -#if defined(HAVE_DLFCN_H) && defined(HAVE_DLADDR) +#if defined(CLANG_HAVE_DLFCN_H) && defined(CLANG_HAVE_DLADDR) Dl_info info; // This silly cast below avoids a C++ warning. if (dladdr((void *)(uintptr_t)clang_createTranslationUnit, &info) != 0) { diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index d464263c190a7..fc1f9bf342f8d 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -50,9 +50,15 @@ don't. */ #cmakedefine01 HAVE_DECL_STRERROR_S +/* Define to 1 if you have the <dlfcn.h> header file. */ +#cmakedefine HAVE_DLFCN_H ${HAVE_DLFCN_H} + /* Define if dlopen() is available on this platform. */ #cmakedefine HAVE_DLOPEN ${HAVE_DLOPEN} +/* Define if dladdr() is available on this platform. */ +#cmakedefine HAVE_DLADDR ${HAVE_DLADDR} + /* Define to 1 if we can register EH frames on this platform. */ #cmakedefine HAVE_REGISTER_FRAME ${HAVE_REGISTER_FRAME} diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake index 483c5adc99ca8..6605ea60df99e 100644 --- a/llvm/include/llvm/Config/llvm-config.h.cmake +++ b/llvm/include/llvm/Config/llvm-config.h.cmake @@ -198,10 +198,4 @@ /* Define if plugins enabled */ #cmakedefine LLVM_ENABLE_PLUGINS -/* Define to 1 if you have the <dlfcn.h> header file. */ -#cmakedefine HAVE_DLFCN_H ${HAVE_DLFCN_H} - -/* Define if dladdr() is available on this platform. */ -#cmakedefine HAVE_DLADDR ${HAVE_DLADDR} - #endif From 48b9106656870912dd68f3b3f24012f6f8dbee52 Mon Sep 17 00:00:00 2001 From: David Green <david.green@arm.com> Date: Fri, 22 Dec 2023 13:25:00 +0000 Subject: [PATCH 174/342] [AArch64] Add an strict fp reduction test. 
NFC --- .../CodeGen/AArch64/vecreduce-fadd-strict.ll | 306 ++++++++++++++++++ .../CodeGen/AArch64/vecreduce-fmul-strict.ll | 274 ++++++++++++++++ 2 files changed, 580 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll create mode 100644 llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll new file mode 100644 index 0000000000000..63b5a97703e64 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll @@ -0,0 +1,306 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16 +; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16 + +define float @add_HalfS(<2 x float> %bin.rdx) { +; CHECK-LABEL: add_HalfS: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: faddp s0, v0.2s +; CHECK-NEXT: ret + %r = call float @llvm.vector.reduce.fadd.f32.v2f32(float -0.0, <2 x float> %bin.rdx) + ret float %r +} + +define half @add_HalfH(<4 x half> %bin.rdx) { +; CHECK-SD-NOFP16-LABEL: add_HalfH: +; CHECK-SD-NOFP16: // %bb.0: +; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt s2, h0 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fadd s1, s2, s1 +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2] +; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: ret +; +; CHECK-SD-FP16-LABEL: add_HalfH: +; CHECK-SD-FP16: // %bb.0: +; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-FP16-NEXT: mov h1, v0.h[2] +; CHECK-SD-FP16-NEXT: faddp h2, v0.2h +; CHECK-SD-FP16-NEXT: mov h0, v0.h[3] +; CHECK-SD-FP16-NEXT: fadd h1, h2, h1 +; CHECK-SD-FP16-NEXT: fadd h0, h1, h0 +; CHECK-SD-FP16-NEXT: ret + %r = call half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx) + ret half %r +} + + +define half @add_H(<8 x half> %bin.rdx) { +; CHECK-SD-NOFP16-LABEL: add_H: +; CHECK-SD-NOFP16: // %bb.0: +; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt s2, h0 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fadd s1, s2, s1 +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[4] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[5] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, 
v0.h[6] +; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fadd s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fadd s0, s1, s0 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: ret +; +; CHECK-SD-FP16-LABEL: add_H: +; CHECK-SD-FP16: // %bb.0: +; CHECK-SD-FP16-NEXT: mov h1, v0.h[2] +; CHECK-SD-FP16-NEXT: faddp h2, v0.2h +; CHECK-SD-FP16-NEXT: mov h3, v0.h[3] +; CHECK-SD-FP16-NEXT: fadd h1, h2, h1 +; CHECK-SD-FP16-NEXT: mov h2, v0.h[4] +; CHECK-SD-FP16-NEXT: fadd h1, h1, h3 +; CHECK-SD-FP16-NEXT: mov h3, v0.h[5] +; CHECK-SD-FP16-NEXT: fadd h1, h1, h2 +; CHECK-SD-FP16-NEXT: mov h2, v0.h[6] +; CHECK-SD-FP16-NEXT: mov h0, v0.h[7] +; CHECK-SD-FP16-NEXT: fadd h1, h1, h3 +; CHECK-SD-FP16-NEXT: fadd h1, h1, h2 +; CHECK-SD-FP16-NEXT: fadd h0, h1, h0 +; CHECK-SD-FP16-NEXT: ret + %r = call half @llvm.vector.reduce.fadd.f16.v8f16(half -0.0, <8 x half> %bin.rdx) + ret half %r +} + +define float @add_S(<4 x float> %bin.rdx) { +; CHECK-LABEL: add_S: +; CHECK: // %bb.0: +; CHECK-NEXT: mov s1, v0.s[2] +; CHECK-NEXT: faddp s2, v0.2s +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: fadd s1, s2, s1 +; CHECK-NEXT: fadd s0, s1, s0 +; CHECK-NEXT: ret + %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %bin.rdx) + ret float %r +} + +define double @add_D(<2 x double> %bin.rdx) { +; CHECK-LABEL: add_D: +; CHECK: // %bb.0: +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: ret + %r = call double @llvm.vector.reduce.fadd.f64.v2f64(double -0.0, <2 x double> %bin.rdx) + ret double %r +} + +define half @add_2H(<16 x half> %bin.rdx) { +; CHECK-SD-NOFP16-LABEL: add_2H: +; CHECK-SD-NOFP16: // %bb.0: +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt s3, h0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fadd s2, s3, s2 +; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[2] +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3 +; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3 +; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[4] +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3 +; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[5] +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3 +; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[6] +; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7] +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fadd s2, s2, s3 +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fadd s0, s2, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h1 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[2] +; 
CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[4] +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[5] +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[6] +; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7] +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fadd s0, s0, s1 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: ret +; +; CHECK-SD-FP16-LABEL: add_2H: +; CHECK-SD-FP16: // %bb.0: +; CHECK-SD-FP16-NEXT: mov h2, v0.h[2] +; CHECK-SD-FP16-NEXT: faddp h3, v0.2h +; CHECK-SD-FP16-NEXT: mov h4, v0.h[3] +; CHECK-SD-FP16-NEXT: fadd h2, h3, h2 +; CHECK-SD-FP16-NEXT: mov h3, v0.h[4] +; CHECK-SD-FP16-NEXT: fadd h2, h2, h4 +; CHECK-SD-FP16-NEXT: mov h4, v0.h[5] +; CHECK-SD-FP16-NEXT: fadd h2, h2, h3 +; CHECK-SD-FP16-NEXT: mov h3, v0.h[6] +; CHECK-SD-FP16-NEXT: mov h0, v0.h[7] +; CHECK-SD-FP16-NEXT: fadd h2, h2, h4 +; CHECK-SD-FP16-NEXT: fadd h2, h2, h3 +; CHECK-SD-FP16-NEXT: mov h3, v1.h[2] +; CHECK-SD-FP16-NEXT: fadd h0, h2, h0 +; CHECK-SD-FP16-NEXT: mov h2, v1.h[1] +; CHECK-SD-FP16-NEXT: fadd h0, h0, h1 +; CHECK-SD-FP16-NEXT: fadd h0, h0, h2 +; CHECK-SD-FP16-NEXT: mov h2, v1.h[3] +; CHECK-SD-FP16-NEXT: fadd h0, h0, h3 +; CHECK-SD-FP16-NEXT: mov h3, v1.h[4] +; CHECK-SD-FP16-NEXT: fadd h0, h0, h2 +; CHECK-SD-FP16-NEXT: mov h2, v1.h[5] +; CHECK-SD-FP16-NEXT: fadd h0, h0, h3 +; CHECK-SD-FP16-NEXT: mov h3, v1.h[6] +; CHECK-SD-FP16-NEXT: mov h1, v1.h[7] +; CHECK-SD-FP16-NEXT: fadd h0, h0, h2 +; CHECK-SD-FP16-NEXT: fadd h0, h0, h3 +; CHECK-SD-FP16-NEXT: fadd h0, h0, h1 +; CHECK-SD-FP16-NEXT: ret + %r = call half @llvm.vector.reduce.fadd.f16.v16f16(half -0.0, <16 x half> %bin.rdx) + ret half %r +} + +define float @add_2S(<8 x float> %bin.rdx) { +; CHECK-LABEL: add_2S: +; CHECK: // %bb.0: +; CHECK-NEXT: mov s2, v0.s[2] +; CHECK-NEXT: faddp s3, v0.2s +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: fadd s2, s3, s2 +; CHECK-NEXT: mov s3, v1.s[2] +; CHECK-NEXT: fadd s0, s2, s0 +; CHECK-NEXT: mov s2, v1.s[1] +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: mov s1, v1.s[3] +; CHECK-NEXT: fadd s0, s0, s2 +; CHECK-NEXT: fadd s0, s0, s3 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ret + %r = call float @llvm.vector.reduce.fadd.f32.v8f32(float -0.0, <8 x float> %bin.rdx) + ret float %r +} + +define double @add_2D(<4 x double> %bin.rdx) { +; CHECK-LABEL: add_2D: +; CHECK: // %bb.0: +; CHECK-NEXT: faddp d0, v0.2d +; CHECK-NEXT: mov d2, v1.d[1] +; CHECK-NEXT: fadd d0, d0, d1 +; CHECK-NEXT: fadd d0, d0, d2 +; CHECK-NEXT: ret + %r = call double @llvm.vector.reduce.fadd.f64.v4f64(double -0.0, <4 x double> %bin.rdx) + ret double %r +} + +; Added at least one test where the start value is not -0.0. 
+define float @add_S_init_42(<4 x float> %bin.rdx) { +; CHECK-LABEL: add_S_init_42: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1109917696 // =0x42280000 +; CHECK-NEXT: mov s2, v0.s[1] +; CHECK-NEXT: mov s3, v0.s[2] +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fadd s1, s0, s1 +; CHECK-NEXT: mov s0, v0.s[3] +; CHECK-NEXT: fadd s1, s1, s2 +; CHECK-NEXT: fadd s1, s1, s3 +; CHECK-NEXT: fadd s0, s1, s0 +; CHECK-NEXT: ret + %r = call float @llvm.vector.reduce.fadd.f32.v4f32(float 42.0, <4 x float> %bin.rdx) + ret float %r +} + +; Function Attrs: nounwind readnone +declare half @llvm.vector.reduce.fadd.f16.v4f16(half, <4 x half>) +declare half @llvm.vector.reduce.fadd.f16.v8f16(half, <8 x half>) +declare half @llvm.vector.reduce.fadd.f16.v16f16(half, <16 x half>) +declare float @llvm.vector.reduce.fadd.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.f32.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fadd.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.f64.v4f64(double, <4 x double>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-SD: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll new file mode 100644 index 0000000000000..68cd3496a923a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16 +; RUN: llc -mtriple=aarch64-none-eabi -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16 + +define float @mul_HalfS(<2 x float> %bin.rdx) { +; CHECK-LABEL: mul_HalfS: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmul s0, s0, v0.s[1] +; CHECK-NEXT: ret + %r = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx) + ret float %r +} + +define half @mul_HalfH(<4 x half> %bin.rdx) { +; CHECK-SD-NOFP16-LABEL: mul_HalfH: +; CHECK-SD-NOFP16: // %bb.0: +; CHECK-SD-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt s2, h0 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fmul s1, s2, s1 +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2] +; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fmul s0, s1, s0 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: ret +; +; CHECK-SD-FP16-LABEL: mul_HalfH: +; CHECK-SD-FP16: // %bb.0: +; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-FP16-NEXT: fmul h1, h0, v0.h[1] +; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2] +; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[3] +; CHECK-SD-FP16-NEXT: ret + %r = call half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx) + ret half %r +} + + +define half @mul_H(<8 x half> %bin.rdx) { +; CHECK-SD-NOFP16-LABEL: mul_H: +; CHECK-SD-NOFP16: // %bb.0: +; CHECK-SD-NOFP16-NEXT: mov h1, v0.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt s2, h0 +; CHECK-SD-NOFP16-NEXT: 
fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fmul s1, s2, s1 +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[2] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[4] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[5] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[6] +; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7] +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fmul s1, s1, s2 +; CHECK-SD-NOFP16-NEXT: fcvt h1, s1 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fmul s0, s1, s0 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: ret +; +; CHECK-SD-FP16-LABEL: mul_H: +; CHECK-SD-FP16: // %bb.0: +; CHECK-SD-FP16-NEXT: fmul h1, h0, v0.h[1] +; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[2] +; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[3] +; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[4] +; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[5] +; CHECK-SD-FP16-NEXT: fmul h1, h1, v0.h[6] +; CHECK-SD-FP16-NEXT: fmul h0, h1, v0.h[7] +; CHECK-SD-FP16-NEXT: ret + %r = call half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx) + ret half %r +} + +define float @mul_S(<4 x float> %bin.rdx) { +; CHECK-LABEL: mul_S: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul s1, s0, v0.s[1] +; CHECK-NEXT: fmul s1, s1, v0.s[2] +; CHECK-NEXT: fmul s0, s1, v0.s[3] +; CHECK-NEXT: ret + %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx) + ret float %r +} + +define double @mul_D(<2 x double> %bin.rdx) { +; CHECK-LABEL: mul_D: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul d0, d0, v0.d[1] +; CHECK-NEXT: ret + %r = call double @llvm.vector.reduce.fmul.f64.v2f64(double 1.0, <2 x double> %bin.rdx) + ret double %r +} + +define half @mul_2H(<16 x half> %bin.rdx) { +; CHECK-SD-NOFP16-LABEL: mul_2H: +; CHECK-SD-NOFP16: // %bb.0: +; CHECK-SD-NOFP16-NEXT: mov h2, v0.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt s3, h0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fmul s2, s3, s2 +; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[2] +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3 +; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3 +; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[4] +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3 +; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[5] +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3 +; CHECK-SD-NOFP16-NEXT: mov h3, v0.h[6] +; CHECK-SD-NOFP16-NEXT: mov h0, v0.h[7] +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s3, h3 +; 
CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fmul s2, s2, s3 +; CHECK-SD-NOFP16-NEXT: fcvt h2, s2 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fmul s0, s2, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h1 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[1] +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[2] +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[3] +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[4] +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[5] +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: mov h2, v1.h[6] +; CHECK-SD-NOFP16-NEXT: mov h1, v1.h[7] +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s2, h2 +; CHECK-SD-NOFP16-NEXT: fcvt s1, h1 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s2 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: fcvt s0, h0 +; CHECK-SD-NOFP16-NEXT: fmul s0, s0, s1 +; CHECK-SD-NOFP16-NEXT: fcvt h0, s0 +; CHECK-SD-NOFP16-NEXT: ret +; +; CHECK-SD-FP16-LABEL: mul_2H: +; CHECK-SD-FP16: // %bb.0: +; CHECK-SD-FP16-NEXT: fmul h2, h0, v0.h[1] +; CHECK-SD-FP16-NEXT: fmul h2, h2, v0.h[2] +; CHECK-SD-FP16-NEXT: fmul h2, h2, v0.h[3] +; CHECK-SD-FP16-NEXT: fmul h2, h2, v0.h[4] +; CHECK-SD-FP16-NEXT: fmul h2, h2, v0.h[5] +; CHECK-SD-FP16-NEXT: fmul h2, h2, v0.h[6] +; CHECK-SD-FP16-NEXT: fmul h0, h2, v0.h[7] +; CHECK-SD-FP16-NEXT: fmul h0, h0, h1 +; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[1] +; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[2] +; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[3] +; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[4] +; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[5] +; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[6] +; CHECK-SD-FP16-NEXT: fmul h0, h0, v1.h[7] +; CHECK-SD-FP16-NEXT: ret + %r = call half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx) + ret half %r +} + +define float @mul_2S(<8 x float> %bin.rdx) { +; CHECK-LABEL: mul_2S: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul s2, s0, v0.s[1] +; CHECK-NEXT: fmul s2, s2, v0.s[2] +; CHECK-NEXT: fmul s0, s2, v0.s[3] +; CHECK-NEXT: fmul s0, s0, s1 +; CHECK-NEXT: fmul s0, s0, v1.s[1] +; CHECK-NEXT: fmul s0, s0, v1.s[2] +; CHECK-NEXT: fmul s0, s0, v1.s[3] +; CHECK-NEXT: ret + %r = call float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx) + ret float %r +} + +define double @mul_2D(<4 x double> %bin.rdx) { +; CHECK-LABEL: mul_2D: +; CHECK: // %bb.0: +; CHECK-NEXT: fmul d0, d0, v0.d[1] +; CHECK-NEXT: fmul d0, d0, d1 +; CHECK-NEXT: fmul d0, d0, v1.d[1] +; CHECK-NEXT: ret + %r = call double @llvm.vector.reduce.fmul.f64.v4f64(double 1.0, <4 x double> %bin.rdx) + ret double %r +} + +; Added at least one test where the start value is not 1.0. 
+define float @mul_S_init_42(<4 x float> %bin.rdx) { +; CHECK-LABEL: mul_S_init_42: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1109917696 // =0x42280000 +; CHECK-NEXT: fmov s1, w8 +; CHECK-NEXT: fmul s1, s1, s0 +; CHECK-NEXT: fmul s1, s1, v0.s[1] +; CHECK-NEXT: fmul s1, s1, v0.s[2] +; CHECK-NEXT: fmul s0, s1, v0.s[3] +; CHECK-NEXT: ret + %r = call float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx) + ret float %r +} + +; Function Attrs: nounwind readnone +declare half @llvm.vector.reduce.fmul.f16.v4f16(half, <4 x half>) +declare half @llvm.vector.reduce.fmul.f16.v8f16(half, <8 x half>) +declare half @llvm.vector.reduce.fmul.f16.v16f16(half, <16 x half>) +declare float @llvm.vector.reduce.fmul.f32.v2f32(float, <2 x float>) +declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-SD: {{.*}} From d03beb94195ae6889d3075dabe64d58c9ab5d1d2 Mon Sep 17 00:00:00 2001 From: Ilya Biryukov <ibiryukov@google.com> Date: Fri, 22 Dec 2023 14:41:38 +0100 Subject: [PATCH 175/342] [clang-format] Do not break on JS fields like on goto labels (#76233) This regressions was introduced in 70d7ea0cebcf363cd0ddcfb76375fb5fada87dd5. The commit moved some code and correctly picked up an explicit check for not running on Verilog. However, the moved code also never ran for JavaScript and after the commit we run it there and this causes the wrong formatting of: ```js export type Params = Config&{ columns: Column[]; }; ``` into ```js export type Params = Config&{ columns: Column[]; }; ``` --- clang/lib/Format/UnwrappedLineParser.cpp | 6 ++++-- clang/unittests/Format/FormatTestJS.cpp | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index c38b4c884070b..684609747a551 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -1650,8 +1650,10 @@ void UnwrappedLineParser::parseStructuralElement( return; } // In Verilog labels can be any expression, so we don't do them here. - if (!Style.isVerilog() && Tokens->peekNextToken()->is(tok::colon) && - !Line->MustBeDeclaration) { + // JS doesn't have macros, and within classes colons indicate fields, not + // labels. 
+ if (!Style.isJavaScript() && !Style.isVerilog() && + Tokens->peekNextToken()->is(tok::colon) && !Line->MustBeDeclaration) { nextToken(); Line->Tokens.begin()->Tok->MustBreakBefore = true; FormatTok->setFinalizedType(TT_GotoLabelColon); diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp index e185eceb35305..3aded8f3726d8 100644 --- a/clang/unittests/Format/FormatTestJS.cpp +++ b/clang/unittests/Format/FormatTestJS.cpp @@ -2836,5 +2836,11 @@ TEST_F(FormatTestJS, AlignConsecutiveAssignmentsAndDeclarations) { Style); } +TEST_F(FormatTestJS, DontBreakFieldsAsGoToLabels) { + verifyFormat("export type Params = Config&{\n" + " columns: Column[];\n" + "};"); +} + } // namespace format } // end namespace clang From 85b23271928c48f87cd950b55a434fc11a212306 Mon Sep 17 00:00:00 2001 From: Adam Paszke <apaszke@google.com> Date: Fri, 22 Dec 2023 14:46:34 +0100 Subject: [PATCH 176/342] [mlir][nvvm] Fix the PTX lowering of wgmma.mma_async (#76150) --- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 2 +- mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 4f5d71e10f68c..a4de89d928e1b 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -1003,7 +1003,7 @@ void NVVM::WgmmaMmaAsyncOp::getAsmValues( {makeConstantI32(rewriter, static_cast<int>(getLayoutA())), mlir::NVVM::PTXRegisterMod::Read}); asmValues.push_back( - {makeConstantI32(rewriter, static_cast<int>(getLayoutB())), + {makeConstantI32(rewriter, 1 - static_cast<int>(getLayoutB())), mlir::NVVM::PTXRegisterMod::Read}); } } diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index 43de50f3dc8de..74186138c3a98 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -297,7 +297,7 @@ func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{ // CHECK: %[[A2:.*]] = llvm.mlir.constant(-1 : i32) : i32 // CHECK: %[[A3:.*]] = llvm.mlir.constant(-1 : i32) : i32 // CHECK: %[[A4:.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: %[[A5:.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: %[[A5:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[V0:.*]] = llvm.extractvalue %[[RES]][0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: %[[V4:.*]] = llvm.extractvalue %[[RES]][4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: %[[V11:.*]] = llvm.extractvalue %[[RES]][11] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> From f5ab0bb14855154b8ecaf24fc2a2797dd8e95d17 Mon Sep 17 00:00:00 2001 From: Tomas Matheson <76168689+tmatheson-arm@users.noreply.github.com> Date: Fri, 22 Dec 2023 13:54:21 +0000 Subject: [PATCH 177/342] [AArch64] paci<k>171615 auti<k>171615 assembly (#76227) This adds the following instructions which are added in PAuthLR: - PACIA171615 - PACIB171615 - AUTIA171615 - AUTIB171615 Also updates some encodings to match final published values. 
Documentation can be found here: https://developer.arm.com/documentation/ddi0602/2023-12/Base-Instructions Co-authored-by: Lucas Prates <lucas.prates@arm.com> --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 5 +++ llvm/test/MC/AArch64/armv9.5a-pauthlr.s | 35 ++++++++++++++++--- .../Disassembler/AArch64/armv9.5a-pauthlr.txt | 16 +++++++++ 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 977729bb082b7..62b2bf490f37a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1671,6 +1671,11 @@ let Predicates = [HasPAuthLR] in { // opcode2, opcode, asm def AUTIASPPCr : SignAuthOneReg<0b00001, 0b100100, "autiasppc">; def AUTIBSPPCr : SignAuthOneReg<0b00001, 0b100101, "autibsppc">; + // opcode2, opcode, asm + def PACIA171615 : SignAuthFixedRegs<0b00001, 0b100010, "pacia171615">; + def PACIB171615 : SignAuthFixedRegs<0b00001, 0b100011, "pacib171615">; + def AUTIA171615 : SignAuthFixedRegs<0b00001, 0b101110, "autia171615">; + def AUTIB171615 : SignAuthFixedRegs<0b00001, 0b101111, "autib171615">; } let Uses = [LR, SP], isReturn = 1, isTerminator = 1, isBarrier = 1 in { diff --git a/llvm/test/MC/AArch64/armv9.5a-pauthlr.s b/llvm/test/MC/AArch64/armv9.5a-pauthlr.s index 24e9c44984683..2bc84c13f70ff 100644 --- a/llvm/test/MC/AArch64/armv9.5a-pauthlr.s +++ b/llvm/test/MC/AArch64/armv9.5a-pauthlr.s @@ -98,6 +98,33 @@ label1: // CHECK-ERROR: instruction requires: pauth-lr // CHECK-UNKNOWN: dac197fe <unknown> + pacia171615 +// CHECK-INST: pacia171615 +// CHECK-DISASS: pacia171615 +// CHECK-ENCODING: [0xfe,0x8b,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac18bfe <unknown> + + pacib171615 +// CHECK-INST: pacib171615 +// CHECK-DISASS: pacib171615 +// CHECK-ENCODING: [0xfe,0x8f,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac18ffe <unknown> + + autia171615 +// CHECK-INST: autia171615 +// CHECK-DISASS: autia171615 +// CHECK-ENCODING: [0xfe,0xbb,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac1bbfe <unknown> + + autib171615 +// CHECK-INST: autib171615 +// CHECK-DISASS: autib171615 +// CHECK-ENCODING: [0xfe,0xbf,0xc1,0xda] +// CHECK-ERROR: instruction requires: pauth-lr +// CHECK-UNKNOWN: dac1bffe <unknown> retaasppc label1 // CHECK-INST: retaasppc label1 @@ -105,7 +132,7 @@ label1: // CHECK-ENCODING: [0bAAA11111,A,0b000AAAAA,0x55] // CHECK-ENCODING: // fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 // CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: 5500019f <unknown> +// CHECK-UNKNOWN: 5500021f <unknown> retabsppc label1 // CHECK-INST: retabsppc label1 @@ -113,18 +140,18 @@ label1: // CHECK-ENCODING: [0bAAA11111,A,0b001AAAAA,0x55] // CHECK-ENCODING: // fixup A - offset: 0, value: label1, kind: fixup_aarch64_pcrel_branch16 // CHECK-ERROR: instruction requires: pauth-lr -// CHECK-UNKNOWN: 552001bf <unknown> +// CHECK-UNKNOWN: 5520023f <unknown> retaasppc #0 // CHECK-INST: retaasppc #0 -// CHECK-DISASS: retaasppc 0x3c <label1+0x38> +// CHECK-DISASS: retaasppc 0x4c <label1+0x48> // CHECK-ENCODING: [0x1f,0x00,0x00,0x55] // CHECK-ERROR: instruction requires: pauth-lr // CHECK-UNKNOWN: 5500001f <unknown> retaasppc #-(1<<18)+4 // CHECK-INST: retaasppc #-262140 -// CHECK-DISASS: retaasppc 0xfffffffffffc0044 <label1+0xfffffffffffc0040> +// CHECK-DISASS: retaasppc 0xfffffffffffc0054 <label1+0xfffffffffffc0050> // 
CHECK-ENCODING: [0xff,0xff,0x1f,0x55] // CHECK-ERROR: instruction requires: pauth-lr // CHECK-UNKNOWN: 551fffff <unknown> diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt index caf1fde2c2b7c..53d78023bc4b4 100644 --- a/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt +++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-pauthlr.txt @@ -49,6 +49,22 @@ # CHECK: autibsppc xzr # NO-PAUTHLR: invalid instruction encoding +[0xfe,0x8b,0xc1,0xda] +# CHECK: pacia171615 +# NO-PAUTHLR: invalid instruction encoding + +[0xfe,0x8f,0xc1,0xda] +# CHECK: pacib171615 +# NO-PAUTHLR: invalid instruction encoding + +[0xfe,0xbb,0xc1,0xda] +# CHECK: autia171615 +# NO-PAUTHLR: invalid instruction encoding + +[0xfe,0xbf,0xc1,0xda] +# CHECK: autib171615 +# NO-PAUTHLR: invalid instruction encoding + [0xbf,0x01,0x00,0x55] # CHECK: retaasppc #-52 # NO-PAUTHLR: invalid instruction encoding From efeb546865c233dfa7706ee0316c676de9f69897 Mon Sep 17 00:00:00 2001 From: Ilya Biryukov <ibiryukov@google.com> Date: Fri, 22 Dec 2023 15:07:43 +0100 Subject: [PATCH 178/342] [clang-format] Add common attribute macros to Google style (#76239) We have found that 199fc973ced20016b04ba540cf63a1d4914fa513 regresses formatting of our codebases because we do not properly configure the names of attribute macros. `GUARDED_BY` and `ABSL_GUARDED_BY` are very commoon in Google codebases so it is reasonable to include them by default to avoid the need for extra configuration in every Google repository. --- clang/lib/Format/Format.cpp | 3 +++ clang/unittests/Format/FormatTest.cpp | 27 ++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 28271181e07d0..f798d555bf992 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -1698,6 +1698,9 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) { /*BasedOnStyle=*/"google", }, }; + GoogleStyle.AttributeMacros.push_back("GUARDED_BY"); + GoogleStyle.AttributeMacros.push_back("ABSL_GUARDED_BY"); + GoogleStyle.SpacesBeforeTrailingComments = 2; GoogleStyle.Standard = FormatStyle::LS_Auto; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 0e08723aa9e94..9772c3be71877 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "FormatTestBase.h" +#include "gmock/gmock.h" #define DEBUG_TYPE "format-test" @@ -8497,7 +8498,10 @@ TEST_F(FormatTest, BreaksFunctionDeclarationsWithTrailingTokens) { " __attribute__((unused));"); Style = getGoogleStyle(); - Style.AttributeMacros.push_back("GUARDED_BY"); + ASSERT_THAT(Style.AttributeMacros, + testing::AllOf(testing::Contains("GUARDED_BY"), + testing::Contains("ABSL_GUARDED_BY"))); + verifyFormat( "bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" " GUARDED_BY(aaaaaaaaaaaa);", @@ -8514,6 +8518,23 @@ TEST_F(FormatTest, BreaksFunctionDeclarationsWithTrailingTokens) { "bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa GUARDED_BY(aaaaaaaaaaaa) =\n" " aaaaaaaaaaaaaaaaaaaaaaaaa;", Style); + + verifyFormat( + "bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" + " ABSL_GUARDED_BY(aaaaaaaaaaaa);", + Style); + verifyFormat( + "bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n" + " ABSL_GUARDED_BY(aaaaaaaaaaaa);", + Style); + verifyFormat( 
+ "bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ABSL_GUARDED_BY(aaaaaaaaaaaa) =\n" + " aaaaaaaa::aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa;", + Style); + verifyFormat( + "bool aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ABSL_GUARDED_BY(aaaaaaaaaaaa) =\n" + " aaaaaaaaaaaaaaaaaaaaaaaaa;", + Style); } TEST_F(FormatTest, FunctionAnnotations) { @@ -10072,11 +10093,11 @@ TEST_F(FormatTest, ReturnTypeBreakingStyle) { getGoogleStyleWithColumns(40)); verifyFormat("Tttttttttttttttttttttttt ppppppppppppppp\n" " ABSL_GUARDED_BY(mutex1)\n" - " ABSL_GUARDED_BY(mutex2);", + " ABSL_GUARDED_BY(mutex2);", getGoogleStyleWithColumns(40)); verifyFormat("Tttttt f(int a, int b)\n" " ABSL_GUARDED_BY(mutex1)\n" - " ABSL_GUARDED_BY(mutex2);", + " ABSL_GUARDED_BY(mutex2);", getGoogleStyleWithColumns(40)); // * typedefs verifyGoogleFormat("typedef ATTR(X) char x;"); From 3736e1d1cd5c24b554a13e493c8614f458bdf123 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Fri, 22 Dec 2023 13:53:49 +0000 Subject: [PATCH 179/342] [SCEV] Ensure shift amount is in range before calling getZExtValue() Fixes #76234 --- llvm/lib/Analysis/ScalarEvolution.cpp | 7 +++--- llvm/test/Analysis/ScalarEvolution/pr76234.ll | 23 +++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Analysis/ScalarEvolution/pr76234.ll diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 580fe112fcd7b..623814c038a78 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -7914,9 +7914,10 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { // expression. We already checked that ShlAmt < BitWidth, so // the multiplier, 1 << (ShlAmt - AShrAmt), fits into TruncTy as // ShlAmt - AShrAmt < Amt. - uint64_t ShlAmt = ShlAmtCI->getZExtValue(); - if (ShlAmtCI->getValue().ult(BitWidth) && ShlAmt >= AShrAmt) { - APInt Mul = APInt::getOneBitSet(BitWidth - AShrAmt, ShlAmt - AShrAmt); + const APInt &ShlAmt = ShlAmtCI->getValue(); + if (ShlAmt.ult(BitWidth) && ShlAmt.uge(AShrAmt)) { + APInt Mul = APInt::getOneBitSet(BitWidth - AShrAmt, + ShlAmtCI->getZExtValue() - AShrAmt); const SCEV *CompositeExpr = getMulExpr(AddTruncateExpr, getConstant(Mul)); if (L->getOpcode() != Instruction::Shl) diff --git a/llvm/test/Analysis/ScalarEvolution/pr76234.ll b/llvm/test/Analysis/ScalarEvolution/pr76234.ll new file mode 100644 index 0000000000000..0d82f0ed1a81c --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/pr76234.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -disable-output "-passes=print<scalar-evolution>" 2>&1 | FileCheck %s + +; Reduced from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=65278 +define i32 @PR76234() { +; CHECK-LABEL: 'PR76234' +; CHECK-NEXT: Classifying expressions for: @PR76234 +; CHECK-NEXT: %B9 = shl i896 0, -264147265567832623176169892458258303259423663018060761063980354513336951278362429737208627943828593947337197496628564339441173779751342768625269489231469788454193341999502542084365758838213220526512116454105594202074014146375780869419198449383518238244769290448868999168 +; CHECK-NEXT: --> %B9 U: [0,1) S: [0,1) +; CHECK-NEXT: %B39 = ashr i896 %B9, 1 +; CHECK-NEXT: --> %B39 U: [0,1) S: [0,1) Exits: <<Unknown>> LoopDispositions: { %1: Variant } +; CHECK-NEXT: Determining loop execution counts for: @PR76234 +; CHECK-NEXT: Loop %1: <multiple exits> Unpredictable backedge-taken count. 
+; CHECK-NEXT: Loop %1: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %1: Unpredictable symbolic max backedge-taken count. +; CHECK-NEXT: Loop %1: Unpredictable predicated backedge-taken count. +; + %B9 = shl i896 0, -264147265567832623176169892458258303259423663018060761063980354513336951278362429737208627943828593947337197496628564339441173779751342768625269489231469788454193341999502542084365758838213220526512116454105594202074014146375780869419198449383518238244769290448868999168 + br label %1 +1: + %B39 = ashr i896 %B9, 1 + br label %1 +} From e4f1c528326ff1b32ea4b9cdf496312da385cc47 Mon Sep 17 00:00:00 2001 From: Lucas Duarte Prates <lucas.prates@arm.com> Date: Fri, 22 Dec 2023 14:40:29 +0000 Subject: [PATCH 180/342] [AArch64] Assembly support for the Armv9.5-A Memory System Extensions (#76237) This implements assembly support for the Memory Systems Extensions introduced as part of the Armv9.5-A architecture version. The changes include: * New subtarget feature for FEAT_TLBIW. * New system registers for FEAT_HDBSS: * HDBSSBR_EL2 and HDBSSPROD_EL2. * New system registers for FEAT_HACDBS: * HACDBSBR_EL2 and HACDBSCONS_EL2. * New TLBI instructions for FEAT_TLBIW: * VMALLWS2E1(nXS), VMALLWS2E1IS(nXS) and VMALLWS2E1OS(nXS). * New system register for FEAT_FGWTE3: * FGWTE3_EL3. --- clang/test/Driver/aarch64-v95a.c | 4 +++ .../llvm/TargetParser/AArch64TargetParser.h | 2 ++ llvm/lib/Target/AArch64/AArch64.td | 3 +++ .../Target/AArch64/AArch64SystemOperands.td | 22 +++++++++++++++ .../AArch64/AsmParser/AArch64AsmParser.cpp | 1 + llvm/test/MC/AArch64/armv9.5a-fgwte3.s | 6 +++++ llvm/test/MC/AArch64/armv9.5a-hacdbs.s | 12 +++++++++ llvm/test/MC/AArch64/armv9.5a-hdbss.s | 12 +++++++++ llvm/test/MC/AArch64/armv9.5a-tlbiw.s | 27 +++++++++++++++++++ .../Disassembler/AArch64/armv9.5a-fgwte3.txt | 7 +++++ .../Disassembler/AArch64/armv9.5a-hacdbs.txt | 14 ++++++++++ .../Disassembler/AArch64/armv9.5a-hdbss.txt | 14 ++++++++++ .../Disassembler/AArch64/armv9.5a-tlbiw.txt | 27 +++++++++++++++++++ .../TargetParser/TargetParserTest.cpp | 2 ++ 14 files changed, 153 insertions(+) create mode 100644 llvm/test/MC/AArch64/armv9.5a-fgwte3.s create mode 100644 llvm/test/MC/AArch64/armv9.5a-hacdbs.s create mode 100644 llvm/test/MC/AArch64/armv9.5a-hdbss.s create mode 100644 llvm/test/MC/AArch64/armv9.5a-tlbiw.s create mode 100644 llvm/test/MC/Disassembler/AArch64/armv9.5a-fgwte3.txt create mode 100644 llvm/test/MC/Disassembler/AArch64/armv9.5a-hacdbs.txt create mode 100644 llvm/test/MC/Disassembler/AArch64/armv9.5a-hdbss.txt create mode 100644 llvm/test/MC/Disassembler/AArch64/armv9.5a-tlbiw.txt diff --git a/clang/test/Driver/aarch64-v95a.c b/clang/test/Driver/aarch64-v95a.c index 6fac62e8b389a..13069c04c8d1c 100644 --- a/clang/test/Driver/aarch64-v95a.c +++ b/clang/test/Driver/aarch64-v95a.c @@ -25,3 +25,7 @@ // RUN: %clang -target aarch64 -march=armv9.5a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s // RUN: %clang -target aarch64 -march=armv9.5-a+pauth-lr -### -c %s 2>&1 | FileCheck -check-prefix=V95A-PAUTHLR %s // V95A-PAUTHLR: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+neon" "-target-feature" "+v9.5a" "-target-feature" "+pauth-lr" + +// RUN: %clang -target aarch64 -march=armv9.5a+tlbiw -### -c %s 2>&1 | FileCheck -check-prefix=V95A-TLBIW %s +// RUN: %clang -target aarch64 -march=armv9.5-a+tlbiw -### -c %s 2>&1 | FileCheck -check-prefix=V95A-TLBIW %s +// V95A-TLBIW: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" 
"-target-feature" "+neon" "-target-feature" "+v9.5a" "-target-feature" "+tlbiw" diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 6c7410a8b8f79..53dc2be825f28 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -175,6 +175,7 @@ enum ArchExtKind : unsigned { AEK_SMEFA64 = 71, // FEAT_SME_FA64 AEK_CPA = 72, // FEAT_CPA AEK_PAUTHLR = 73, // FEAT_PAuth_LR + AEK_TLBIW = 74, // FEAT_TLBIW AEK_NUM_EXTENSIONS }; using ExtensionBitset = Bitset<AEK_NUM_EXTENSIONS>; @@ -299,6 +300,7 @@ inline constexpr ExtensionInfo Extensions[] = { {"sme-fa64", AArch64::AEK_SMEFA64, "+sme-fa64", "-sme-fa64", FEAT_INIT, "", 0}, {"cpa", AArch64::AEK_CPA, "+cpa", "-cpa", FEAT_INIT, "", 0}, {"pauth-lr", AArch64::AEK_PAUTHLR, "+pauth-lr", "-pauth-lr", FEAT_INIT, "", 0}, + {"tlbiw", AArch64::AEK_TLBIW, "+tlbiw", "-tlbiw", FEAT_INIT, "", 0}, // Special cases {"none", AArch64::AEK_NONE, {}, {}, FEAT_INIT, "", ExtensionInfo::MaxFMVPriority}, }; diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 97e92a57a7ff4..68f452039c9b6 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -630,6 +630,9 @@ def FeatureCPA : SubtargetFeature<"cpa", "HasCPA", "true", def FeaturePAuthLR : SubtargetFeature<"pauth-lr", "HasPAuthLR", "true", "Enable Armv9.5-A PAC enhancements (FEAT_PAuth_LR)">; +def FeatureTLBIW : SubtargetFeature<"tlbiw", "HasTLBIW", "true", + "Enable ARMv9.5-A TLBI VMALL for Dirty State (FEAT_TLBIW)">; + //===----------------------------------------------------------------------===// // Architectures. // diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td index 28a5776a3089c..0b80f263e12ee 100644 --- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td +++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td @@ -643,6 +643,14 @@ defm : TLBI<"PAALLOS", 0b110, 0b1000, 0b0001, 0b100, 0>; defm : TLBI<"PAALL", 0b110, 0b1000, 0b0111, 0b100, 0>; } +// Armv9.5-A TLBI VMALL for Dirty State +let Requires = ["AArch64::FeatureTLBIW"] in { +// op1, CRn, CRm, op2, needsreg +defm : TLBI<"VMALLWS2E1", 0b100, 0b1000, 0b0110, 0b010, 0>; +defm : TLBI<"VMALLWS2E1IS", 0b100, 0b1000, 0b0010, 0b010, 0>; +defm : TLBI<"VMALLWS2E1OS", 0b100, 0b1000, 0b0101, 0b010, 0>; +} + //===----------------------------------------------------------------------===// // MRS/MSR (system register read/write) instruction options. 
//===----------------------------------------------------------------------===// @@ -1951,3 +1959,17 @@ def : WOSysReg<"SPMZR_EL0", 0b10, 0b011, 0b1001, 0b1100, 0b100>; // Op0 Op1 CRn CRm Op2 def : RWSysReg<"VDISR_EL3", 0b11, 0b110, 0b1100, 0b0001, 0b001>; def : RWSysReg<"VSESR_EL3", 0b11, 0b110, 0b0101, 0b0010, 0b011>; + +// v9.5a Hardware Dirty State Tracking Structure (FEAT_HDBSS) +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"HDBSSBR_EL2", 0b11, 0b100, 0b0010, 0b0011, 0b010>; +def : RWSysReg<"HDBSSPROD_EL2", 0b11, 0b100, 0b0010, 0b0011, 0b011>; + +// v9.5a Hardware Accelerator for Cleaning Dirty State (FEAT_HACDBS) +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"HACDBSBR_EL2", 0b11, 0b100, 0b0010, 0b0011, 0b100>; +def : RWSysReg<"HACDBSCONS_EL2", 0b11, 0b100, 0b0010, 0b0011, 0b101>; + +// v9.5a Fine Grained Write Trap EL3 (FEAT_FGWTE3) +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"FGWTE3_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b101>; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 38a92cb096029..be66790c42776 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3706,6 +3706,7 @@ static const struct Extension { {"sme-f8f32", {AArch64::FeatureSMEF8F32}}, {"sme-fa64", {AArch64::FeatureSMEFA64}}, {"cpa", {AArch64::FeatureCPA}}, + {"tlbiw", {AArch64::FeatureTLBIW}}, }; static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { diff --git a/llvm/test/MC/AArch64/armv9.5a-fgwte3.s b/llvm/test/MC/AArch64/armv9.5a-fgwte3.s new file mode 100644 index 0000000000000..2352bc7e1ca71 --- /dev/null +++ b/llvm/test/MC/AArch64/armv9.5a-fgwte3.s @@ -0,0 +1,6 @@ +// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s + +mrs x0, FGWTE3_EL3 +// CHECK: mrs x0, FGWTE3_EL3 // encoding: [0xa0,0x11,0x3e,0xd5] +msr FGWTE3_EL3, x0 +// CHECK: msr FGWTE3_EL3, x0 // encoding: [0xa0,0x11,0x1e,0xd5] diff --git a/llvm/test/MC/AArch64/armv9.5a-hacdbs.s b/llvm/test/MC/AArch64/armv9.5a-hacdbs.s new file mode 100644 index 0000000000000..8ccba29beb444 --- /dev/null +++ b/llvm/test/MC/AArch64/armv9.5a-hacdbs.s @@ -0,0 +1,12 @@ +// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s + +mrs x0, HACDBSBR_EL2 +// CHECK: mrs x0, HACDBSBR_EL2 // encoding: [0x80,0x23,0x3c,0xd5] +msr HACDBSBR_EL2, x0 +// CHECK: msr HACDBSBR_EL2, x0 // encoding: [0x80,0x23,0x1c,0xd5] + +mrs x0, HACDBSCONS_EL2 +// CHECK: mrs x0, HACDBSCONS_EL2 // encoding: [0xa0,0x23,0x3c,0xd5] +msr HACDBSCONS_EL2, x0 +// CHECK: msr HACDBSCONS_EL2, x0 // encoding: [0xa0,0x23,0x1c,0xd5] + diff --git a/llvm/test/MC/AArch64/armv9.5a-hdbss.s b/llvm/test/MC/AArch64/armv9.5a-hdbss.s new file mode 100644 index 0000000000000..c4505c9d70e7f --- /dev/null +++ b/llvm/test/MC/AArch64/armv9.5a-hdbss.s @@ -0,0 +1,12 @@ +// RUN: llvm-mc -triple aarch64 -show-encoding < %s | FileCheck %s + +mrs x0, HDBSSBR_EL2 +// CHECK: mrs x0, HDBSSBR_EL2 // encoding: [0x40,0x23,0x3c,0xd5] +msr HDBSSBR_EL2, x0 +// CHECK: msr HDBSSBR_EL2, x0 // encoding: [0x40,0x23,0x1c,0xd5] + +mrs x0, HDBSSPROD_EL2 +// CHECK: mrs x0, HDBSSPROD_EL2 // encoding: [0x60,0x23,0x3c,0xd5] +msr HDBSSPROD_EL2, x0 +// CHECK: msr HDBSSPROD_EL2, x0 // encoding: [0x60,0x23,0x1c,0xd5] + diff --git a/llvm/test/MC/AArch64/armv9.5a-tlbiw.s b/llvm/test/MC/AArch64/armv9.5a-tlbiw.s new file mode 100644 index 0000000000000..435ed06b33c8c --- /dev/null +++ b/llvm/test/MC/AArch64/armv9.5a-tlbiw.s @@ -0,0 +1,27 @@ +// RUN: llvm-mc -triple aarch64 -show-encoding 
-mattr=+tlbiw -mattr=+xs < %s | FileCheck --check-prefix=CHECK-TLBIW --check-prefix=CHECK-XS %s +// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+tlbiw < %s 2> %t | FileCheck --check-prefix=CHECK-TLBIW %s && FileCheck --check-prefix=ERROR-NO-XS-TLBIW %s < %t +// RUN: not llvm-mc -triple aarch64 < %s 2>&1 | FileCheck --check-prefix=ERROR-NO-TLBIW --check-prefix=ERROR-NO-XS-TLBIW %s + +tlbi VMALLWS2E1 +// CHECK-TLBIW: tlbi vmallws2e1 // encoding: [0x5f,0x86,0x0c,0xd5] +// ERROR-NO-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1 requires: tlbiw + +tlbi VMALLWS2E1IS +// CHECK-TLBIW: tlbi vmallws2e1is // encoding: [0x5f,0x82,0x0c,0xd5] +// ERROR-NO-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1IS requires: tlbiw + +tlbi VMALLWS2E1OS +// CHECK-TLBIW: tlbi vmallws2e1os // encoding: [0x5f,0x85,0x0c,0xd5] +// ERROR-NO-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1OS requires: tlbiw + +tlbi VMALLWS2E1nXS +// CHECK-XS: tlbi vmallws2e1nxs // encoding: [0x5f,0x96,0x0c,0xd5] +// ERROR-NO-XS-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1nXS requires: xs, tlbiw + +tlbi VMALLWS2E1ISnXS +// CHECK-XS: tlbi vmallws2e1isnxs // encoding: [0x5f,0x92,0x0c,0xd5] +// ERROR-NO-XS-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1ISnXS requires: xs, tlbiw + +tlbi VMALLWS2E1OSnXS +// CHECK-XS: tlbi vmallws2e1osnxs // encoding: [0x5f,0x95,0x0c,0xd5] +// ERROR-NO-XS-TLBIW: [[@LINE-2]]:6: error: TLBI VMALLWS2E1OSnXS requires: xs, tlbiw diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-fgwte3.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-fgwte3.txt new file mode 100644 index 0000000000000..f7e355a700af0 --- /dev/null +++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-fgwte3.txt @@ -0,0 +1,7 @@ +# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s + +[0xa0,0x11,0x3e,0xd5] +# CHECK: mrs x0, FGWTE3_EL3 + +[0xa0,0x11,0x1e,0xd5] +# CHECK: msr FGWTE3_EL3, x0 diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-hacdbs.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-hacdbs.txt new file mode 100644 index 0000000000000..d9be7e5ba4432 --- /dev/null +++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-hacdbs.txt @@ -0,0 +1,14 @@ +# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s + +[0x80,0x23,0x3c,0xd5] +# CHECK: mrs x0, HACDBSBR_EL2 + +[0x80,0x23,0x1c,0xd5] +# CHECK: msr HACDBSBR_EL2, x0 + +[0xa0,0x23,0x3c,0xd5] +# CHECK: mrs x0, HACDBSCONS_EL2 + +[0xa0,0x23,0x1c,0xd5] +# CHECK: msr HACDBSCONS_EL2, x0 + diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-hdbss.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-hdbss.txt new file mode 100644 index 0000000000000..999f322548f46 --- /dev/null +++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-hdbss.txt @@ -0,0 +1,14 @@ +# RUN: llvm-mc -triple aarch64 -disassemble < %s | FileCheck %s + +[0x40,0x23,0x3c,0xd5] +# CHECK: mrs x0, HDBSSBR_EL2 + +[0x40,0x23,0x1c,0xd5] +# CHECK: msr HDBSSBR_EL2, x0 + +[0x60,0x23,0x3c,0xd5] +# CHECK: mrs x0, HDBSSPROD_EL2 + +[0x60,0x23,0x1c,0xd5] +# CHECK: msr HDBSSPROD_EL2, x0 + diff --git a/llvm/test/MC/Disassembler/AArch64/armv9.5a-tlbiw.txt b/llvm/test/MC/Disassembler/AArch64/armv9.5a-tlbiw.txt new file mode 100644 index 0000000000000..df5e894a929e4 --- /dev/null +++ b/llvm/test/MC/Disassembler/AArch64/armv9.5a-tlbiw.txt @@ -0,0 +1,27 @@ +# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+tlbiw -mattr=+xs < %s | FileCheck --check-prefix=CHECK-TLBIW --check-prefix=CHECK-XS %s +# RUN: llvm-mc -triple aarch64 -disassemble -mattr=+tlbiw < %s | FileCheck --check-prefix=CHECK-TLBIW --check-prefix=CHECK-NO-XS-TLBIW %s +# RUN: llvm-mc 
-triple aarch64 -disassemble < %s | FileCheck --check-prefix=CHECK-NO-TLBIW --check-prefix=CHECK-NO-XS-TLBIW %s + +[0x5f,0x86,0x0c,0xd5] +# CHECK-TLBIW: tlbi vmallws2e1 +# CHECK-NO-TLBIW: sys #4, c8, c6, #2 + +[0x5f,0x82,0x0c,0xd5] +# CHECK-TLBIW: tlbi vmallws2e1is +# CHECK-NO-TLBIW: sys #4, c8, c2, #2 + +[0x5f,0x85,0x0c,0xd5] +# CHECK-TLBIW: tlbi vmallws2e1os +# CHECK-NO-TLBIW: sys #4, c8, c5, #2 + +[0x5f,0x96,0x0c,0xd5] +# CHECK-XS: tlbi vmallws2e1nxs +# CHECK-NO-XS-TLBIW: sys #4, c9, c6, #2 + +[0x5f,0x92,0x0c,0xd5] +# CHECK-XS: tlbi vmallws2e1isnxs +# CHECK-NO-XS-TLBIW: sys #4, c9, c2, #2 + +[0x5f,0x95,0x0c,0xd5] +# CHECK-XS: tlbi vmallws2e1osnxs +# CHECK-NO-XS-TLBIW: sys #4, c9, c5, #2 diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 866176ab09836..92bd4da1d3a47 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1813,6 +1813,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { AArch64::AEK_SME_LUTv2, AArch64::AEK_SMEF8F16, AArch64::AEK_SMEF8F32, AArch64::AEK_SMEFA64, AArch64::AEK_CPA, AArch64::AEK_PAUTHLR, + AArch64::AEK_TLBIW, }; std::vector<StringRef> Features; @@ -1901,6 +1902,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+sme-fa64")); EXPECT_TRUE(llvm::is_contained(Features, "+cpa")); EXPECT_TRUE(llvm::is_contained(Features, "+pauth-lr")); + EXPECT_TRUE(llvm::is_contained(Features, "+tlbiw")); // Assuming we listed every extension above, this should produce the same // result. (note that AEK_NONE doesn't have a name so it won't be in the From 5cb7534a7d17e65bdfa676a4e56698e2a05b89b9 Mon Sep 17 00:00:00 2001 From: Rainer Orth <ro@gcc.gnu.org> Date: Fri, 22 Dec 2023 15:58:03 +0100 Subject: [PATCH 181/342] [tsan] Only intercept pthread_mutex_clocklock on Linux (#76220) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `tsan_interceptors_posix.cpp` doesn't compile on FreeBSD 14.0/amd64: ``` In file included from /vol/llvm/src/llvm-project/local-freebsd/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp:25: /vol/llvm/src/llvm-project/local-freebsd/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp: In function ‘void __tsan::InitializeInterceptors()’: /vol/llvm/src/llvm-project/local-freebsd/compiler-rt/lib/tsan/rtl/../../interception/interception.h:243:25: error: ‘real_pthread_mutex_clocklock’ is not a member of ‘__interception’; did you mean ‘real_pthread_mutex_unlock’? ``` Fixed by wrapping the `TSAN_INTERCEPT` invocation with `SANITIZER_LINUX` as is already done for the interceptor definition. Tested on `amd64-pc-freebsd14.0`. 
--- compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp index fdcba6e8ca739..a9f6673ac44e9 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp @@ -2918,7 +2918,9 @@ void InitializeInterceptors() { TSAN_INTERCEPT(pthread_mutex_trylock); TSAN_INTERCEPT(pthread_mutex_timedlock); TSAN_INTERCEPT(pthread_mutex_unlock); +#if SANITIZER_LINUX TSAN_INTERCEPT(pthread_mutex_clocklock); +#endif #if SANITIZER_GLIBC # if !__GLIBC_PREREQ(2, 34) TSAN_INTERCEPT(__pthread_mutex_lock); From 40ec791b15b0110785a91b057e95535e8b0989b6 Mon Sep 17 00:00:00 2001 From: HaohaiWen <haohai.wen@intel.com> Date: Fri, 22 Dec 2023 23:06:16 +0800 Subject: [PATCH 182/342] [RegAllocFast] Refactor dominates algorithm for large basic block (#72250) The original brute force dominates algorithm is O(n) complexity so it is very slow for very large machine basic block which is very common with O0. This patch added InstrPosIndexes to assign index for each instruction and use it to determine dominance. The complexity is now O(1). --- llvm/lib/CodeGen/RegAllocFast.cpp | 128 ++++++++++++++++++++++++++---- 1 file changed, 114 insertions(+), 14 deletions(-) diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 40c42cabf7763..a52013a74c2e1 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -62,6 +62,107 @@ static RegisterRegAlloc fastRegAlloc("fast", "fast register allocator", namespace { +/// Assign ascending index for instructions in machine basic block. The index +/// can be used to determine dominance between instructions in same MBB. +class InstrPosIndexes { +public: + void init(const MachineBasicBlock &MBB) { + CurMBB = &MBB; + Instr2PosIndex.clear(); + uint64_t LastIndex = 0; + for (const MachineInstr &MI : MBB) { + LastIndex += InstrDist; + Instr2PosIndex[&MI] = LastIndex; + } + } + + /// Set \p Index to index of \p MI. If \p MI is new inserted, it try to assign + /// index without affecting existing instruction's index. Return true if all + /// instructions index has been reassigned. + bool getIndex(const MachineInstr &MI, uint64_t &Index) { + assert(MI.getParent() == CurMBB && "MI is not in CurMBB"); + if (Instr2PosIndex.count(&MI)) { + Index = Instr2PosIndex[&MI]; + return false; + } + + // Distance is the number of consecutive unassigned instructions including + // MI. Start is the first instruction of them. End is the next of last + // instruction of them. + // e.g. + // |Instruction| A | B | C | MI | D | E | + // | Index | 1024 | | | | | 2048 | + // + // In this case, B, C, MI, D are unassigned. Distance is 4, Start is B, End + // is E. + unsigned Distance = 1; + MachineBasicBlock::const_iterator Start = MI.getIterator(), + End = std::next(Start); + while (Start != CurMBB->begin() && + !Instr2PosIndex.count(&*std::prev(Start))) { + --Start; + ++Distance; + } + while (End != CurMBB->end() && !Instr2PosIndex.count(&*(End))) { + ++End; + ++Distance; + } + + // LastIndex is initialized to last used index prior to MI or zero. + // In previous example, LastIndex is 1024, EndIndex is 2048; + uint64_t LastIndex = + Start == CurMBB->begin() ? 0 : Instr2PosIndex.at(&*std::prev(Start)); + uint64_t Step; + if (End == CurMBB->end()) + Step = static_cast<uint64_t>(InstrDist); + else { + // No instruction uses index zero. 
+ uint64_t EndIndex = Instr2PosIndex.at(&*End); + assert(EndIndex > LastIndex && "Index must be ascending order"); + unsigned NumAvailableIndexes = EndIndex - LastIndex - 1; + // We want index gap between two adjacent MI is as same as possible. Given + // total A available indexes, D is number of consecutive unassigned + // instructions, S is the step. + // |<- S-1 -> MI <- S-1 -> MI <- A-S*D ->| + // There're S-1 available indexes between unassigned instruction and its + // predecessor. There're A-S*D available indexes between the last + // unassigned instruction and its successor. + // Ideally, we want + // S-1 = A-S*D + // then + // S = (A+1)/(D+1) + // An valid S must be integer greater than zero, so + // S <= (A+1)/(D+1) + // => + // A-S*D >= 0 + // That means we can safely use (A+1)/(D+1) as step. + // In previous example, Step is 204, Index of B, C, MI, D is 1228, 1432, + // 1636, 1840. + Step = (NumAvailableIndexes + 1) / (Distance + 1); + } + + // Reassign index for all instructions if number of new inserted + // instructions exceed slot or all instructions are new. + if (LLVM_UNLIKELY(!Step || (!LastIndex && Step == InstrDist))) { + init(*CurMBB); + Index = Instr2PosIndex.at(&MI); + return true; + } + + for (auto I = Start; I != End; ++I) { + LastIndex += Step; + Instr2PosIndex[&*I] = LastIndex; + } + Index = Instr2PosIndex.at(&MI); + return false; + } + +private: + enum { InstrDist = 1024 }; + const MachineBasicBlock *CurMBB = nullptr; + DenseMap<const MachineInstr *, uint64_t> Instr2PosIndex; +}; + class RegAllocFast : public MachineFunctionPass { public: static char ID; @@ -153,6 +254,9 @@ class RegAllocFast : public MachineFunctionPass { // Register masks attached to the current instruction. SmallVector<const uint32_t *> RegMasks; + // Assign index for each instruction to quickly determine dominance. + InstrPosIndexes PosIndexes; + void setPhysRegState(MCPhysReg PhysReg, unsigned NewState); bool isPhysRegFree(MCPhysReg PhysReg) const; @@ -339,18 +443,13 @@ int RegAllocFast::getStackSpaceFor(Register VirtReg) { return FrameIdx; } -static bool dominates(MachineBasicBlock &MBB, - MachineBasicBlock::const_iterator A, - MachineBasicBlock::const_iterator B) { - auto MBBEnd = MBB.end(); - if (B == MBBEnd) - return true; - - MachineBasicBlock::const_iterator I = MBB.begin(); - for (; &*I != A && &*I != B; ++I) - ; - - return &*I == A; +static bool dominates(InstrPosIndexes &PosIndexes, const MachineInstr &A, + const MachineInstr &B) { + uint64_t IndexA, IndexB; + PosIndexes.getIndex(A, IndexA); + if (LLVM_UNLIKELY(PosIndexes.getIndex(B, IndexB))) + PosIndexes.getIndex(A, IndexA); + return IndexA < IndexB; } /// Returns false if \p VirtReg is known to not live out of the current block. @@ -371,7 +470,7 @@ bool RegAllocFast::mayLiveOut(Register VirtReg) { MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg)); return true; } else { - if (!SelfLoopDef || dominates(*MBB, DefInst.getIterator(), SelfLoopDef)) + if (!SelfLoopDef || dominates(PosIndexes, DefInst, *SelfLoopDef)) SelfLoopDef = &DefInst; } } @@ -396,7 +495,7 @@ bool RegAllocFast::mayLiveOut(Register VirtReg) { // Try to handle some simple cases to avoid spilling and reloading every // value inside a self looping block. 
if (SelfLoopDef == &UseInst || - !dominates(*MBB, SelfLoopDef->getIterator(), UseInst.getIterator())) { + !dominates(PosIndexes, *SelfLoopDef, UseInst)) { MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg)); return true; } @@ -1565,6 +1664,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) { this->MBB = &MBB; LLVM_DEBUG(dbgs() << "\nAllocating " << MBB); + PosIndexes.init(MBB); RegUnitStates.assign(TRI->getNumRegUnits(), regFree); assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?"); From 5f254eb05566f5b400a212ae77117a9efd9019a1 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi <geek4civic@gmail.com> Date: Sat, 23 Dec 2023 00:01:43 +0900 Subject: [PATCH 183/342] [Bazel] Fixup for #76163 This also reverts 7c9c807fa433 and 476812a74260. --- .../clang/include/clang/Config/config.h | 6 ++++++ .../llvm-project-overlay/llvm/include/llvm/Config/config.h | 6 ++++++ .../llvm/include/llvm/Config/llvm-config.h | 6 ------ utils/bazel/llvm_configs/config.h.cmake | 6 ++++++ utils/bazel/llvm_configs/llvm-config.h.cmake | 6 ------ 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/clang/include/clang/Config/config.h b/utils/bazel/llvm-project-overlay/clang/include/clang/Config/config.h index 88ace233b5418..ac0d9eb24931f 100644 --- a/utils/bazel/llvm-project-overlay/clang/include/clang/Config/config.h +++ b/utils/bazel/llvm-project-overlay/clang/include/clang/Config/config.h @@ -71,6 +71,12 @@ /* Define if we have sys/resource.h (rlimits) */ /* CLANG_HAVE_RLIMITS defined conditionally below */ +/* Define if we have dlfcn.h */ +#define CLANG_HAVE_DLFCN_H 1 + +/* Define if dladdr() is available on this platform. */ +#define CLANG_HAVE_DLADDR 1 + /* Linker version detected at compile time. */ /* #undef HOST_LINK_VERSION */ diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h index da18916b14a76..b4fb2373d571f 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/config.h @@ -66,9 +66,15 @@ don't. */ #define HAVE_DECL_STRERROR_S 0 +/* Define to 1 if you have the <dlfcn.h> header file. */ +#define HAVE_DLFCN_H 1 + /* Define if dlopen() is available on this platform. */ #define HAVE_DLOPEN 1 +/* Define if dladdr() is available on this platform. */ +#define HAVE_DLADDR 1 + /* Define to 1 if we can register EH frames on this platform. */ /* HAVE_REGISTER_FRAME defined in Bazel*/ diff --git a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h index 5235d8303f568..5240b8299c109 100644 --- a/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h +++ b/utils/bazel/llvm-project-overlay/llvm/include/llvm/Config/llvm-config.h @@ -124,10 +124,4 @@ /* Define to 1 if you have the DIA SDK installed, and to 0 if you don't. */ #define LLVM_ENABLE_DIA_SDK 0 -/* Define to 1 if you have the <dlfcn.h> header file. */ -#define HAVE_DLFCN_H 1 - -/* Define if dladdr() is available on this platform. */ -#define HAVE_DLADDR 1 - #endif diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake index d464263c190a7..fc1f9bf342f8d 100644 --- a/utils/bazel/llvm_configs/config.h.cmake +++ b/utils/bazel/llvm_configs/config.h.cmake @@ -50,9 +50,15 @@ don't. 
*/ #cmakedefine01 HAVE_DECL_STRERROR_S +/* Define to 1 if you have the <dlfcn.h> header file. */ +#cmakedefine HAVE_DLFCN_H ${HAVE_DLFCN_H} + /* Define if dlopen() is available on this platform. */ #cmakedefine HAVE_DLOPEN ${HAVE_DLOPEN} +/* Define if dladdr() is available on this platform. */ +#cmakedefine HAVE_DLADDR ${HAVE_DLADDR} + /* Define to 1 if we can register EH frames on this platform. */ #cmakedefine HAVE_REGISTER_FRAME ${HAVE_REGISTER_FRAME} diff --git a/utils/bazel/llvm_configs/llvm-config.h.cmake b/utils/bazel/llvm_configs/llvm-config.h.cmake index 483c5adc99ca8..6605ea60df99e 100644 --- a/utils/bazel/llvm_configs/llvm-config.h.cmake +++ b/utils/bazel/llvm_configs/llvm-config.h.cmake @@ -198,10 +198,4 @@ /* Define if plugins enabled */ #cmakedefine LLVM_ENABLE_PLUGINS -/* Define to 1 if you have the <dlfcn.h> header file. */ -#cmakedefine HAVE_DLFCN_H ${HAVE_DLFCN_H} - -/* Define if dladdr() is available on this platform. */ -#cmakedefine HAVE_DLADDR ${HAVE_DLADDR} - #endif From 658b260dbf0afa86d1339d276145840da37242df Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Fri, 22 Dec 2023 15:18:24 +0100 Subject: [PATCH 184/342] [Attributor] Don't construct pretty GEPs Bring this in line with other transforms like ArgPromotion/SROA/ SCEVExpander and always produce canonical i8 GEPs. --- .../Transforms/IPO/AttributorAttributes.cpp | 53 +---- .../Attributor/ArgumentPromotion/array.ll | 20 +- .../Attributor/ArgumentPromotion/attrs.ll | 20 +- .../Attributor/ArgumentPromotion/byval-2.ll | 18 +- .../Attributor/ArgumentPromotion/byval.ll | 36 +-- .../Attributor/ArgumentPromotion/fp80.ll | 8 +- .../Attributor/ArgumentPromotion/tail.ll | 20 +- .../IPConstantProp/2009-09-24-byval-ptr.ll | 38 ++-- .../Attributor/value-simplify-pointer-info.ll | 206 +++++++++--------- 9 files changed, 197 insertions(+), 222 deletions(-) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 8e1f782f7cd81..b2618e35b0855 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -291,42 +291,15 @@ static const Value *getPointerOperand(const Instruction *I, } /// Helper function to create a pointer based on \p Ptr, and advanced by \p -/// Offset bytes. To aid later analysis the method tries to build -/// getelement pointer instructions that traverse the natural type of \p Ptr if -/// possible. If that fails, the remaining offset is adjusted byte-wise, hence -/// through a cast to i8*. -/// -/// TODO: This could probably live somewhere more prominantly if it doesn't -/// already exist. -static Value *constructPointer(Type *PtrElemTy, Value *Ptr, int64_t Offset, - IRBuilder<NoFolder> &IRB, const DataLayout &DL) { - assert(Offset >= 0 && "Negative offset not supported yet!"); +/// Offset bytes. +static Value *constructPointer(Value *Ptr, int64_t Offset, + IRBuilder<NoFolder> &IRB) { LLVM_DEBUG(dbgs() << "Construct pointer: " << *Ptr << " + " << Offset << "-bytes\n"); - if (Offset) { - Type *Ty = PtrElemTy; - APInt IntOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset); - SmallVector<APInt> IntIndices = DL.getGEPIndicesForOffset(Ty, IntOffset); - - SmallVector<Value *, 4> ValIndices; - std::string GEPName = Ptr->getName().str(); - for (const APInt &Index : IntIndices) { - ValIndices.push_back(IRB.getInt(Index)); - GEPName += "." + std::to_string(Index.getZExtValue()); - } - - // Create a GEP for the indices collected above. 
- Ptr = IRB.CreateGEP(PtrElemTy, Ptr, ValIndices, GEPName); - - // If an offset is left we use byte-wise adjustment. - if (IntOffset != 0) { - Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(IntOffset), - GEPName + ".b" + Twine(IntOffset.getZExtValue())); - } - } - - LLVM_DEBUG(dbgs() << "Constructed pointer: " << *Ptr << "\n"); + if (Offset) + Ptr = IRB.CreateGEP(IRB.getInt8Ty(), Ptr, IRB.getInt64(Offset), + Ptr->getName() + ".b" + Twine(Offset)); return Ptr; } @@ -7487,16 +7460,15 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) { const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType); for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) { - Value *Ptr = constructPointer( - PrivType, &Base, PrivStructLayout->getElementOffset(u), IRB, DL); + Value *Ptr = + constructPointer(&Base, PrivStructLayout->getElementOffset(u), IRB); new StoreInst(F.getArg(ArgNo + u), Ptr, &IP); } } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) { Type *PointeeTy = PrivArrayType->getElementType(); uint64_t PointeeTySize = DL.getTypeStoreSize(PointeeTy); for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) { - Value *Ptr = - constructPointer(PrivType, &Base, u * PointeeTySize, IRB, DL); + Value *Ptr = constructPointer(&Base, u * PointeeTySize, IRB); new StoreInst(F.getArg(ArgNo + u), Ptr, &IP); } } else { @@ -7521,8 +7493,8 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType); for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) { Type *PointeeTy = PrivStructType->getElementType(u); - Value *Ptr = constructPointer( - PrivType, Base, PrivStructLayout->getElementOffset(u), IRB, DL); + Value *Ptr = + constructPointer(Base, PrivStructLayout->getElementOffset(u), IRB); LoadInst *L = new LoadInst(PointeeTy, Ptr, "", IP); L->setAlignment(Alignment); ReplacementValues.push_back(L); @@ -7531,8 +7503,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { Type *PointeeTy = PrivArrayType->getElementType(); uint64_t PointeeTySize = DL.getTypeStoreSize(PointeeTy); for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) { - Value *Ptr = - constructPointer(PrivType, Base, u * PointeeTySize, IRB, DL); + Value *Ptr = constructPointer(Base, u * PointeeTySize, IRB); LoadInst *L = new LoadInst(PointeeTy, Ptr, "", IP); L->setAlignment(Alignment); ReplacementValues.push_back(L); diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/array.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/array.ll index b777e764247f8..a52bbfbe1a346 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/array.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/array.ll @@ -11,10 +11,10 @@ define void @caller() { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[LEFT:%.*]] = alloca [3 x i32], align 4 ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[LEFT]], align 4 -; TUNIT-NEXT: [[LEFT_0_1:%.*]] = getelementptr [3 x i32], ptr [[LEFT]], i64 0, i64 1 -; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[LEFT_0_1]], align 4 -; TUNIT-NEXT: [[LEFT_0_2:%.*]] = getelementptr [3 x i32], ptr [[LEFT]], i64 0, i64 2 -; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[LEFT_0_2]], align 4 +; TUNIT-NEXT: [[LEFT_B4:%.*]] = getelementptr i8, ptr [[LEFT]], i64 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[LEFT_B4]], align 4 +; TUNIT-NEXT: [[LEFT_B8:%.*]] = getelementptr 
i8, ptr [[LEFT]], i64 8 +; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[LEFT_B8]], align 4 ; TUNIT-NEXT: call void @callee(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) ; TUNIT-NEXT: ret void ; @@ -36,10 +36,10 @@ define internal void @callee(ptr noalias %arg) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARG_PRIV:%.*]] = alloca [3 x i32], align 4 ; CHECK-NEXT: store i32 [[TMP0]], ptr [[ARG_PRIV]], align 4 -; CHECK-NEXT: [[ARG_PRIV_0_1:%.*]] = getelementptr [3 x i32], ptr [[ARG_PRIV]], i64 0, i64 1 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARG_PRIV_0_1]], align 4 -; CHECK-NEXT: [[ARG_PRIV_0_2:%.*]] = getelementptr [3 x i32], ptr [[ARG_PRIV]], i64 0, i64 2 -; CHECK-NEXT: store i32 [[TMP2]], ptr [[ARG_PRIV_0_2]], align 4 +; CHECK-NEXT: [[ARG_PRIV_B4:%.*]] = getelementptr i8, ptr [[ARG_PRIV]], i64 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARG_PRIV_B4]], align 4 +; CHECK-NEXT: [[ARG_PRIV_B8:%.*]] = getelementptr i8, ptr [[ARG_PRIV]], i64 8 +; CHECK-NEXT: store i32 [[TMP2]], ptr [[ARG_PRIV_B8]], align 4 ; CHECK-NEXT: call void @use(ptr noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(12) [[ARG_PRIV]]) ; CHECK-NEXT: ret void ; @@ -48,5 +48,7 @@ entry: ret void } ;. -; CHECK: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) } +; TUNIT: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) } +;. +; CGSCC: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) } ;. diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll index fd9e955819707..877071c1a3fec 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll @@ -15,14 +15,14 @@ define internal i32 @f(ptr byval(%struct.ss) %b, ptr byval(i32) %X, i32 %i) noun ; CHECK-NEXT: store i32 [[TMP2]], ptr [[X_PRIV]], align 4 ; CHECK-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 8 ; CHECK-NEXT: store i32 [[TMP0]], ptr [[B_PRIV]], align 4 -; CHECK-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[B_PRIV]], i64 0, i32 1 -; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_0_1]], align 4 -; CHECK-NEXT: [[TRUETMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8 -; CHECK-NEXT: [[TRUETMP2:%.*]] = add i32 [[TRUETMP1]], 1 -; CHECK-NEXT: store i32 [[TRUETMP2]], ptr [[B_PRIV]], align 8 +; CHECK-NEXT: [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4 +; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 [[TMP2]], ptr [[B_PRIV]], align 8 ; CHECK-NEXT: store i32 0, ptr [[X_PRIV]], align 4 ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[X_PRIV]], align 4 -; CHECK-NEXT: [[A:%.*]] = add i32 [[L]], [[TRUETMP2]] +; CHECK-NEXT: [[A:%.*]] = add i32 [[L]], [[TMP2]] ; CHECK-NEXT: ret i32 [[A]] ; entry: @@ -46,10 +46,10 @@ define i32 @test(ptr %X) { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8 ; TUNIT-NEXT: store i32 1, ptr [[S]], align 8 -; TUNIT-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 +; TUNIT-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[S]], align 8 -; TUNIT-NEXT: [[S_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i64 0, i32 1 -; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_0_1]], align 8 +; TUNIT-NEXT: [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load 
i64, ptr [[S_B4]], align 8 ; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4 ; TUNIT-NEXT: [[C:%.*]] = call i32 @f(i32 [[TMP0]], i64 [[TMP1]], i32 [[TMP2]]) #[[ATTR2:[0-9]+]] ; TUNIT-NEXT: ret i32 [[C]] @@ -59,7 +59,7 @@ define i32 @test(ptr %X) { ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8 -; CGSCC-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 +; CGSCC-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 ; CGSCC-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4 ; CGSCC-NEXT: [[C:%.*]] = call i32 @f(i32 noundef 1, i64 noundef 2, i32 [[TMP0]]) #[[ATTR2:[0-9]+]] ; CGSCC-NEXT: ret i32 [[C]] diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll index 2bb080c42760e..b76254f660904 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll @@ -13,11 +13,11 @@ define internal void @f(ptr byval(%struct.ss) %b, ptr byval(i32) %X) nounwind ; CHECK-NEXT: store i32 [[TMP2]], ptr [[X_PRIV]], align 4 ; CHECK-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 8 ; CHECK-NEXT: store i32 [[TMP0]], ptr [[B_PRIV]], align 4 -; CHECK-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[B_PRIV]], i64 0, i32 1 -; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_0_1]], align 4 -; CHECK-NEXT: [[TRUETMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8 -; CHECK-NEXT: [[TRUETMP2:%.*]] = add i32 [[TRUETMP1]], 1 -; CHECK-NEXT: store i32 [[TRUETMP2]], ptr [[B_PRIV]], align 8 +; CHECK-NEXT: [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4 +; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 [[TMP2]], ptr [[B_PRIV]], align 8 ; CHECK-NEXT: store i32 0, ptr [[X_PRIV]], align 4 ; CHECK-NEXT: ret void ; @@ -38,10 +38,10 @@ define i32 @test(ptr %X) { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8 ; TUNIT-NEXT: store i32 1, ptr [[S]], align 8 -; TUNIT-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 +; TUNIT-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[S]], align 8 -; TUNIT-NEXT: [[S_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i64 0, i32 1 -; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_0_1]], align 8 +; TUNIT-NEXT: [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_B4]], align 8 ; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[X]], align 4 ; TUNIT-NEXT: call void @f(i32 [[TMP0]], i64 [[TMP1]], i32 [[TMP2]]) #[[ATTR2:[0-9]+]] ; TUNIT-NEXT: ret i32 0 @@ -51,7 +51,7 @@ define i32 @test(ptr %X) { ; CGSCC-SAME: (ptr nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 8 -; CGSCC-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 +; CGSCC-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 ; CGSCC-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll 
b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll index 21f3e039f11d6..77667875256f3 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll @@ -13,12 +13,12 @@ define internal i32 @f(ptr byval(%struct.ss) %b) nounwind { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 4 ; CHECK-NEXT: store i32 [[TMP0]], ptr [[B_PRIV]], align 4 -; CHECK-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[B_PRIV]], i64 0, i32 1 -; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_0_1]], align 4 -; CHECK-NEXT: [[TRUETMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8 -; CHECK-NEXT: [[TRUETMP2:%.*]] = add i32 [[TRUETMP1]], 1 -; CHECK-NEXT: store i32 [[TRUETMP2]], ptr [[B_PRIV]], align 8 -; CHECK-NEXT: ret i32 [[TRUETMP1]] +; CHECK-NEXT: [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4 +; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 [[TMP2]], ptr [[B_PRIV]], align 8 +; CHECK-NEXT: ret i32 [[TMP1]] ; entry: %tmp1 = load i32, ptr %b, align 4 @@ -35,12 +35,12 @@ define internal i32 @g(ptr byval(%struct.ss) align 32 %b) nounwind { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[B_PRIV:%.*]] = alloca [[STRUCT_SS:%.*]], align 4 ; CHECK-NEXT: store i32 [[TMP0]], ptr [[B_PRIV]], align 4 -; CHECK-NEXT: [[B_PRIV_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[B_PRIV]], i64 0, i32 1 -; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_0_1]], align 4 -; CHECK-NEXT: [[TRUETMP1:%.*]] = load i32, ptr [[B_PRIV]], align 32 -; CHECK-NEXT: [[TRUETMP2:%.*]] = add i32 [[TRUETMP1]], 1 -; CHECK-NEXT: store i32 [[TRUETMP2]], ptr [[B_PRIV]], align 32 -; CHECK-NEXT: ret i32 [[TRUETMP2]] +; CHECK-NEXT: [[B_PRIV_B4:%.*]] = getelementptr i8, ptr [[B_PRIV]], i64 4 +; CHECK-NEXT: store i64 [[TMP1]], ptr [[B_PRIV_B4]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_PRIV]], align 32 +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 [[TMP2]], ptr [[B_PRIV]], align 32 +; CHECK-NEXT: ret i32 [[TMP2]] ; entry: %tmp1 = load i32, ptr %b, align 4 @@ -57,14 +57,14 @@ define i32 @main() nounwind { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4 ; TUNIT-NEXT: store i32 1, ptr [[S]], align 32 -; TUNIT-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 +; TUNIT-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 ; TUNIT-NEXT: [[TMP0:%.*]] = load i32, ptr [[S]], align 8 -; TUNIT-NEXT: [[S_0_11:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i64 0, i32 1 -; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_0_11]], align 8 +; TUNIT-NEXT: [[S_B4:%.*]] = getelementptr i8, ptr [[S]], i64 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i64, ptr [[S_B4]], align 8 ; TUNIT-NEXT: [[C0:%.*]] = call i32 @f(i32 [[TMP0]], i64 [[TMP1]]) #[[ATTR1:[0-9]+]] ; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[S]], align 32 -; TUNIT-NEXT: [[S_0_1:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i64 0, i32 1 -; TUNIT-NEXT: [[TMP3:%.*]] = load i64, ptr [[S_0_1]], align 32 +; TUNIT-NEXT: [[S_B41:%.*]] = getelementptr i8, ptr [[S]], i64 4 +; TUNIT-NEXT: [[TMP3:%.*]] = load i64, ptr [[S_B41]], align 32 ; TUNIT-NEXT: [[C1:%.*]] = call i32 @g(i32 [[TMP2]], i64 [[TMP3]]) #[[ATTR1]] ; TUNIT-NEXT: [[A:%.*]] = add i32 [[C0]], [[C1]] ; TUNIT-NEXT: ret i32 [[A]] @@ -74,7 +74,7 @@ define i32 @main() nounwind { ; CGSCC-SAME: () 
#[[ATTR1:[0-9]+]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 4 -; CGSCC-NEXT: [[TRUETMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 +; CGSCC-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], ptr [[S]], i32 0, i32 1 ; CGSCC-NEXT: [[C0:%.*]] = call i32 @f(i32 noundef 1, i64 noundef 2) #[[ATTR2:[0-9]+]] ; CGSCC-NEXT: [[C1:%.*]] = call i32 @g(i32 noundef 1, i64 noundef 2) #[[ATTR2]] ; CGSCC-NEXT: [[A:%.*]] = add i32 [[C0]], [[C1]] diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll index 0370b86faa096..274181fa8b9ef 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll @@ -14,8 +14,8 @@ target triple = "x86_64-unknown-linux-gnu" @a = internal global %struct.Foo { i32 1, i64 2 }, align 8 ;. -; CHECK: @[[B:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_S:%.*]] { double 3.140000e+00, i16 9439, i8 25, [5 x i8] undef }, align 16 -; CHECK: @[[A:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_FOO:%.*]] { i32 1, i64 2 }, align 8 +; CHECK: @b = internal global %struct.s { double 3.140000e+00, i16 9439, i8 25, [5 x i8] undef }, align 16 +; CHECK: @a = internal global %struct.Foo { i32 1, i64 2 }, align 8 ;. define void @run() { ; @@ -82,8 +82,8 @@ define internal i64 @CaptureAStruct(ptr byval(%struct.Foo) %a) { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[A_PRIV:%.*]] = alloca [[STRUCT_FOO:%.*]], align 8 ; CGSCC-NEXT: store i32 [[TMP0]], ptr [[A_PRIV]], align 4 -; CGSCC-NEXT: [[A_PRIV_0_1:%.*]] = getelementptr [[STRUCT_FOO]], ptr [[A_PRIV]], i64 0, i32 1 -; CGSCC-NEXT: store i64 [[TMP1]], ptr [[A_PRIV_0_1]], align 8 +; CGSCC-NEXT: [[A_PRIV_B8:%.*]] = getelementptr i8, ptr [[A_PRIV]], i64 8 +; CGSCC-NEXT: store i64 [[TMP1]], ptr [[A_PRIV_B8]], align 8 ; CGSCC-NEXT: [[A_PTR:%.*]] = alloca ptr, align 8 ; CGSCC-NEXT: br label [[LOOP:%.*]] ; CGSCC: loop: diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll index 71a6e39e71668..c1e92bde0236d 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll @@ -15,8 +15,8 @@ define internal void @bar(ptr byval(%pair) %Data) { ; TUNIT-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { ; TUNIT-NEXT: [[DATA_PRIV:%.*]] = alloca [[PAIR:%.*]], align 8 ; TUNIT-NEXT: store i32 [[TMP0]], ptr [[DATA_PRIV]], align 4 -; TUNIT-NEXT: [[DATA_PRIV_0_1:%.*]] = getelementptr [[PAIR]], ptr [[DATA_PRIV]], i64 0, i32 1 -; TUNIT-NEXT: store i32 [[TMP1]], ptr [[DATA_PRIV_0_1]], align 4 +; TUNIT-NEXT: [[DATA_PRIV_B4:%.*]] = getelementptr i8, ptr [[DATA_PRIV]], i64 4 +; TUNIT-NEXT: store i32 [[TMP1]], ptr [[DATA_PRIV_B4]], align 4 ; TUNIT-NEXT: [[TMP3:%.*]] = call ptr @foo(ptr nonnull dereferenceable(8) [[DATA_PRIV]]) ; TUNIT-NEXT: ret void ; @@ -25,8 +25,8 @@ define internal void @bar(ptr byval(%pair) %Data) { ; CGSCC-SAME: (i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { ; CGSCC-NEXT: [[DATA_PRIV:%.*]] = alloca [[PAIR:%.*]], align 8 ; CGSCC-NEXT: store i32 [[TMP0]], ptr [[DATA_PRIV]], align 4 -; CGSCC-NEXT: [[DATA_PRIV_0_1:%.*]] = getelementptr [[PAIR]], ptr [[DATA_PRIV]], i64 0, i32 1 -; CGSCC-NEXT: store i32 [[TMP1]], ptr [[DATA_PRIV_0_1]], align 4 +; CGSCC-NEXT: [[DATA_PRIV_B4:%.*]] = getelementptr i8, ptr [[DATA_PRIV]], i64 4 +; CGSCC-NEXT: store i32 [[TMP1]], ptr [[DATA_PRIV_B4]], align 4 ; CGSCC-NEXT: 
[[TMP3:%.*]] = call ptr @foo(ptr noundef nonnull dereferenceable(8) [[DATA_PRIV]]) ; CGSCC-NEXT: ret void ; @@ -38,16 +38,16 @@ define void @zed(ptr byval(%pair) %Data) { ; TUNIT-LABEL: define {{[^@]+}}@zed ; TUNIT-SAME: (ptr noalias nocapture nonnull readonly byval([[PAIR:%.*]]) dereferenceable(8) [[DATA:%.*]]) { ; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[DATA]], align 1 -; TUNIT-NEXT: [[DATA_0_1:%.*]] = getelementptr [[PAIR]], ptr [[DATA]], i64 0, i32 1 -; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DATA_0_1]], align 1 +; TUNIT-NEXT: [[DATA_B4:%.*]] = getelementptr i8, ptr [[DATA]], i64 4 +; TUNIT-NEXT: [[TMP2:%.*]] = load i32, ptr [[DATA_B4]], align 1 ; TUNIT-NEXT: call void @bar(i32 [[TMP1]], i32 [[TMP2]]) ; TUNIT-NEXT: ret void ; ; CGSCC-LABEL: define {{[^@]+}}@zed ; CGSCC-SAME: (ptr noalias nocapture nofree noundef nonnull readonly byval([[PAIR:%.*]]) dereferenceable(8) [[DATA:%.*]]) { ; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[DATA]], align 1 -; CGSCC-NEXT: [[DATA_0_1:%.*]] = getelementptr [[PAIR]], ptr [[DATA]], i64 0, i32 1 -; CGSCC-NEXT: [[TMP2:%.*]] = load i32, ptr [[DATA_0_1]], align 1 +; CGSCC-NEXT: [[DATA_B4:%.*]] = getelementptr i8, ptr [[DATA]], i64 4 +; CGSCC-NEXT: [[TMP2:%.*]] = load i32, ptr [[DATA_B4]], align 1 ; CGSCC-NEXT: call void @bar(i32 [[TMP1]], i32 [[TMP2]]) ; CGSCC-NEXT: ret void ; @@ -55,7 +55,9 @@ define void @zed(ptr byval(%pair) %Data) { ret void } ;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { memory(readwrite, argmem: none) } +; TUNIT: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) } +;. +; CGSCC: attributes #[[ATTR0]] = { memory(readwrite, argmem: none) } ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll index 81e6f959cef4c..154b093e9dbb8 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll @@ -9,7 +9,7 @@ declare void @use(i8) ;. -; CHECK: @[[MYSTR:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_MYSTR:%.*]] zeroinitializer +; CHECK: @mystr = internal global %struct.MYstr zeroinitializer ;. 
define internal void @vfu1(ptr byval(%struct.MYstr) align 4 %u) nounwind { ; CHECK: Function Attrs: nounwind memory(readwrite, argmem: none) @@ -18,8 +18,8 @@ define internal void @vfu1(ptr byval(%struct.MYstr) align 4 %u) nounwind { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]], align 8 ; CHECK-NEXT: store i8 [[TMP0]], ptr [[U_PRIV]], align 1 -; CHECK-NEXT: [[U_PRIV_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i64 0, i32 1 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_0_1]], align 4 +; CHECK-NEXT: [[U_PRIV_B4:%.*]] = getelementptr i8, ptr [[U_PRIV]], i64 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_B4]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i32 0, i32 1 ; CHECK-NEXT: store i32 99, ptr [[TMP2]], align 4 ; CHECK-NEXT: store i8 97, ptr [[U_PRIV]], align 8 @@ -49,8 +49,8 @@ define internal i32 @vfu2(ptr byval(%struct.MYstr) align 4 %u) nounwind readonly ; CHECK-NEXT: entry: ; CHECK-NEXT: [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]], align 8 ; CHECK-NEXT: store i8 [[TMP0]], ptr [[U_PRIV]], align 1 -; CHECK-NEXT: [[U_PRIV_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i64 0, i32 1 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_0_1]], align 4 +; CHECK-NEXT: [[U_PRIV_B4:%.*]] = getelementptr i8, ptr [[U_PRIV]], i64 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_B4]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i32 0, i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[U_PRIV]], align 8 @@ -74,12 +74,12 @@ define i32 @unions() nounwind { ; TUNIT-SAME: () #[[ATTR2:[0-9]+]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[TMP0:%.*]] = load i8, ptr @mystr, align 8 -; TUNIT-NEXT: [[MYSTR_0_1:%.*]] = getelementptr [[STRUCT_MYSTR:%.*]], ptr @mystr, i64 0, i32 1 -; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_0_1]], align 8 +; TUNIT-NEXT: [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_B4]], align 8 ; TUNIT-NEXT: call void @vfu1(i8 [[TMP0]], i32 [[TMP1]]) #[[ATTR2]] ; TUNIT-NEXT: [[TMP2:%.*]] = load i8, ptr @mystr, align 8 -; TUNIT-NEXT: [[MYSTR_0_11:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr @mystr, i64 0, i32 1 -; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_0_11]], align 8 +; TUNIT-NEXT: [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4 +; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_B41]], align 8 ; TUNIT-NEXT: [[RESULT:%.*]] = call i32 @vfu2(i8 [[TMP2]], i32 [[TMP3]]) #[[ATTR3:[0-9]+]] ; TUNIT-NEXT: ret i32 [[RESULT]] ; @@ -88,12 +88,12 @@ define i32 @unions() nounwind { ; CGSCC-SAME: () #[[ATTR2:[0-9]+]] { ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[TMP0:%.*]] = load i8, ptr @mystr, align 8 -; CGSCC-NEXT: [[MYSTR_0_1:%.*]] = getelementptr [[STRUCT_MYSTR:%.*]], ptr @mystr, i64 0, i32 1 -; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_0_1]], align 8 +; CGSCC-NEXT: [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4 +; CGSCC-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_B4]], align 8 ; CGSCC-NEXT: call void @vfu1(i8 [[TMP0]], i32 [[TMP1]]) #[[ATTR2]] ; CGSCC-NEXT: [[TMP2:%.*]] = load i8, ptr @mystr, align 8 -; CGSCC-NEXT: [[MYSTR_0_11:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr @mystr, i64 0, i32 1 -; CGSCC-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_0_11]], align 8 +; CGSCC-NEXT: [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4 +; CGSCC-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_B41]], align 8 ; CGSCC-NEXT: 
[[RESULT:%.*]] = call i32 @vfu2(i8 [[TMP2]], i32 [[TMP3]]) #[[ATTR2]] ; CGSCC-NEXT: ret i32 [[RESULT]] ; @@ -110,8 +110,8 @@ define internal i32 @vfu2_v2(ptr byval(%struct.MYstr) align 4 %u) nounwind reado ; CHECK-NEXT: entry: ; CHECK-NEXT: [[U_PRIV:%.*]] = alloca [[STRUCT_MYSTR:%.*]], align 8 ; CHECK-NEXT: store i8 [[TMP0]], ptr [[U_PRIV]], align 1 -; CHECK-NEXT: [[U_PRIV_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i64 0, i32 1 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_0_1]], align 4 +; CHECK-NEXT: [[U_PRIV_B4:%.*]] = getelementptr i8, ptr [[U_PRIV]], i64 4 +; CHECK-NEXT: store i32 [[TMP1]], ptr [[U_PRIV_B4]], align 4 ; CHECK-NEXT: [[Z:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i32 0, i32 1 ; CHECK-NEXT: store i32 99, ptr [[Z]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr [[U_PRIV]], i32 0, i32 1 @@ -139,12 +139,12 @@ define i32 @unions_v2() nounwind { ; TUNIT-SAME: () #[[ATTR2]] { ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[TMP0:%.*]] = load i8, ptr @mystr, align 8 -; TUNIT-NEXT: [[MYSTR_0_11:%.*]] = getelementptr [[STRUCT_MYSTR:%.*]], ptr @mystr, i64 0, i32 1 -; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_0_11]], align 8 +; TUNIT-NEXT: [[MYSTR_B41:%.*]] = getelementptr i8, ptr @mystr, i64 4 +; TUNIT-NEXT: [[TMP1:%.*]] = load i32, ptr [[MYSTR_B41]], align 8 ; TUNIT-NEXT: call void @vfu1(i8 [[TMP0]], i32 [[TMP1]]) #[[ATTR2]] ; TUNIT-NEXT: [[TMP2:%.*]] = load i8, ptr @mystr, align 8 -; TUNIT-NEXT: [[MYSTR_0_1:%.*]] = getelementptr [[STRUCT_MYSTR]], ptr @mystr, i64 0, i32 1 -; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_0_1]], align 8 +; TUNIT-NEXT: [[MYSTR_B4:%.*]] = getelementptr i8, ptr @mystr, i64 4 +; TUNIT-NEXT: [[TMP3:%.*]] = load i32, ptr [[MYSTR_B4]], align 8 ; TUNIT-NEXT: [[RESULT:%.*]] = call i32 @vfu2_v2(i8 [[TMP2]], i32 [[TMP3]]) #[[ATTR3]] ; TUNIT-NEXT: ret i32 [[RESULT]] ; diff --git a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll index ef1458b8bc84b..99a4c0aac7a23 100644 --- a/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll +++ b/llvm/test/Transforms/Attributor/value-simplify-pointer-info.ll @@ -32,37 +32,37 @@ @rec_storage = internal global i32 undef ;. 
-; CHECK: @[[GLOBALBYTES:[a-zA-Z0-9_$"\\.-]+]] = global [1024 x i8] zeroinitializer, align 16 -; CHECK: @[[GINT1:[a-zA-Z0-9_$"\\.-]+]] = global i32 0, align 4 -; CHECK: @[[GINT2:[a-zA-Z0-9_$"\\.-]+]] = global i32 0, align 4 -; CHECK: @[[GSTATIC_INT1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4 -; CHECK: @[[GSTATIC_INT2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4 -; CHECK: @[[GSTATIC_INT3:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4 -; CHECK: @[[GSTATIC_UNDEF_INT1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4 -; CHECK: @[[GSTATIC_UNDEF_INT2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4 -; CHECK: @[[GI1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4 -; CHECK: @[[GI2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4 -; CHECK: @[[GS1:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_S:%.*]] undef, align 4 -; CHECK: @[[GS2:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_S:%.*]] zeroinitializer, align 4 -; CHECK: @[[VS1:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_S:%.*]] undef, align 4 -; CHECK: @[[VS2:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_S:%.*]] undef, align 4 -; CHECK: @[[GBYTES:[a-zA-Z0-9_$"\\.-]+]] = internal global [1024 x i8] zeroinitializer, align 16 -; CHECK: @[[FLAG0:[a-zA-Z0-9_$"\\.-]+]] = global i32 0, align 4 -; CHECK: @[[FLAG1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4 -; CHECK: @[[FLAG2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4 -; CHECK: @[[FLAG4:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4 -; CHECK: @[[FLAG3:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4 -; CHECK: @[[A1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0 -; CHECK: @[[A2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0 -; CHECK: @[[A3:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef -; CHECK: @[[BYTES1:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef -; CHECK: @[[BYTES2:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef -; CHECK: @[[REC_STORAGE:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef -; CHECK: @[[GLOBAL:[a-zA-Z0-9_$"\\.-]+]] = internal global [[STRUCT_STY:%.*]] zeroinitializer, align 8 -; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 0, align 4 -; CHECK: @[[GC:[a-zA-Z0-9_$"\\.-]+]] = internal global i32 undef, align 4 -; CHECK: @[[GRS:[a-zA-Z0-9_$"\\.-]+]] = internal thread_local global i32 undef -; CHECK: @[[GRS2:[a-zA-Z0-9_$"\\.-]+]] = global i32 undef +; CHECK: @globalBytes = global [1024 x i8] zeroinitializer, align 16 +; CHECK: @Gint1 = global i32 0, align 4 +; CHECK: @Gint2 = global i32 0, align 4 +; CHECK: @Gstatic_int1 = internal global i32 0, align 4 +; CHECK: @Gstatic_int2 = internal global i32 0, align 4 +; CHECK: @Gstatic_int3 = internal global i32 0, align 4 +; CHECK: @Gstatic_undef_int1 = internal global i32 undef, align 4 +; CHECK: @Gstatic_undef_int2 = internal global i32 undef, align 4 +; CHECK: @GI1 = internal global i32 undef, align 4 +; CHECK: @GI2 = internal global i32 undef, align 4 +; CHECK: @Gs1 = internal global %struct.S undef, align 4 +; CHECK: @Gs2 = internal global %struct.S zeroinitializer, align 4 +; CHECK: @Vs1 = internal global %struct.S undef, align 4 +; CHECK: @Vs2 = internal global %struct.S undef, align 4 +; CHECK: @GBytes = internal global [1024 x i8] zeroinitializer, align 16 +; CHECK: @Flag0 = global i32 0, align 4 +; CHECK: @Flag1 = internal global i32 undef, align 4 +; CHECK: @Flag2 = internal global i32 undef, align 4 +; CHECK: @Flag4 = internal global i32 undef, align 4 +; CHECK: @Flag3 = internal global i32 0, 
align 4 +; CHECK: @a1 = internal global i32 0 +; CHECK: @a2 = internal global i32 0 +; CHECK: @a3 = internal global i32 undef +; CHECK: @bytes1 = internal global i32 undef +; CHECK: @bytes2 = internal global i32 undef +; CHECK: @rec_storage = internal global i32 undef +; CHECK: @global = internal global %struct.STy zeroinitializer, align 8 +; CHECK: @G = internal global i32 0, align 4 +; CHECK: @GC = internal global i32 undef, align 4 +; CHECK: @GRS = internal thread_local global i32 undef +; CHECK: @GRS2 = global i32 undef ;. define void @write_arg(ptr %p, i32 %v) { ; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) @@ -2674,10 +2674,10 @@ define dso_local void @test_nested_memory(ptr %dst, ptr %src) { ; TUNIT-NEXT: store ptr [[SRC]], ptr [[SRC2]], align 8 ; TUNIT-NEXT: store ptr [[CALL_H2S]], ptr getelementptr inbounds ([[STRUCT_STY]], ptr @global, i64 0, i32 2), align 8 ; TUNIT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[LOCAL]], align 8 -; TUNIT-NEXT: [[LOCAL_0_1:%.*]] = getelementptr [[STRUCT_STY]], ptr [[LOCAL]], i64 0, i32 1 -; TUNIT-NEXT: [[TMP1:%.*]] = load ptr, ptr [[LOCAL_0_1]], align 8 -; TUNIT-NEXT: [[LOCAL_0_2:%.*]] = getelementptr [[STRUCT_STY]], ptr [[LOCAL]], i64 0, i32 2 -; TUNIT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[LOCAL_0_2]], align 8 +; TUNIT-NEXT: [[LOCAL_B8:%.*]] = getelementptr i8, ptr [[LOCAL]], i64 8 +; TUNIT-NEXT: [[TMP1:%.*]] = load ptr, ptr [[LOCAL_B8]], align 8 +; TUNIT-NEXT: [[LOCAL_B16:%.*]] = getelementptr i8, ptr [[LOCAL]], i64 16 +; TUNIT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[LOCAL_B16]], align 8 ; TUNIT-NEXT: call fastcc void @nested_memory_callee(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]]) #[[ATTR21:[0-9]+]] ; TUNIT-NEXT: ret void ; @@ -2714,10 +2714,10 @@ define internal fastcc void @nested_memory_callee(ptr nocapture readonly %S) nof ; TUNIT-NEXT: entry: ; TUNIT-NEXT: [[S_PRIV:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 ; TUNIT-NEXT: store ptr [[TMP0]], ptr [[S_PRIV]], align 8 -; TUNIT-NEXT: [[S_PRIV_0_1:%.*]] = getelementptr [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 1 -; TUNIT-NEXT: store ptr [[TMP1]], ptr [[S_PRIV_0_1]], align 8 -; TUNIT-NEXT: [[S_PRIV_0_2:%.*]] = getelementptr [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 2 -; TUNIT-NEXT: store ptr [[TMP2]], ptr [[S_PRIV_0_2]], align 8 +; TUNIT-NEXT: [[S_PRIV_B8:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 8 +; TUNIT-NEXT: store ptr [[TMP1]], ptr [[S_PRIV_B8]], align 8 +; TUNIT-NEXT: [[S_PRIV_B16:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 16 +; TUNIT-NEXT: store ptr [[TMP2]], ptr [[S_PRIV_B16]], align 8 ; TUNIT-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 2 ; TUNIT-NEXT: [[TMP3:%.*]] = load ptr, ptr [[INNER]], align 8 ; TUNIT-NEXT: [[INNER1:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[TMP3]], i64 0, i32 2 @@ -2736,10 +2736,10 @@ define internal fastcc void @nested_memory_callee(ptr nocapture readonly %S) nof ; CGSCC-NEXT: entry: ; CGSCC-NEXT: [[S_PRIV:%.*]] = alloca [[STRUCT_STY:%.*]], align 8 ; CGSCC-NEXT: store ptr [[TMP0]], ptr [[S_PRIV]], align 8 -; CGSCC-NEXT: [[S_PRIV_0_1:%.*]] = getelementptr [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 1 -; CGSCC-NEXT: store ptr [[TMP1]], ptr [[S_PRIV_0_1]], align 8 -; CGSCC-NEXT: [[S_PRIV_0_2:%.*]] = getelementptr [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 2 -; CGSCC-NEXT: store ptr [[TMP2]], ptr [[S_PRIV_0_2]], align 8 +; CGSCC-NEXT: [[S_PRIV_B8:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 8 +; CGSCC-NEXT: store ptr [[TMP1]], ptr [[S_PRIV_B8]], align 8 +; 
CGSCC-NEXT: [[S_PRIV_B16:%.*]] = getelementptr i8, ptr [[S_PRIV]], i64 16 +; CGSCC-NEXT: store ptr [[TMP2]], ptr [[S_PRIV_B16]], align 8 ; CGSCC-NEXT: [[INNER:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[S_PRIV]], i64 0, i32 2 ; CGSCC-NEXT: [[TMP3:%.*]] = load ptr, ptr [[INNER]], align 8 ; CGSCC-NEXT: [[INNER1:%.*]] = getelementptr inbounds [[STRUCT_STY]], ptr [[TMP3]], i64 0, i32 2 @@ -3289,67 +3289,67 @@ declare void @llvm.assume(i1 noundef) ;. ; TUNIT: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; TUNIT: [[META1:![0-9]+]] = !{i32 7, !"uwtable", i32 1} -; TUNIT: [[META2:![0-9]+]] = !{!"clang version 13.0.0"} -; TUNIT: [[TBAA3]] = !{!4, !4, i64 0} -; TUNIT: [[META4:![0-9]+]] = !{!"int", !5, i64 0} -; TUNIT: [[META5:![0-9]+]] = !{!"omnipotent char", !6, i64 0} -; TUNIT: [[META6:![0-9]+]] = !{!"Simple C/C++ TBAA"} -; TUNIT: [[TBAA7]] = !{!8, !9, i64 12} -; TUNIT: [[META8:![0-9]+]] = !{!"S", !4, i64 0, !4, i64 4, !4, i64 8, !9, i64 12, !9, i64 16, !9, i64 20} -; TUNIT: [[META9:![0-9]+]] = !{!"float", !5, i64 0} -; TUNIT: [[TBAA10]] = !{!8, !9, i64 16} -; TUNIT: [[TBAA11]] = !{!8, !9, i64 20} -; TUNIT: [[TBAA12]] = !{!8, !4, i64 0} -; TUNIT: [[TBAA13]] = !{!8, !4, i64 4} -; TUNIT: [[TBAA14]] = !{!8, !4, i64 8} -; TUNIT: [[LOOP15]] = distinct !{!15, !16} -; TUNIT: [[META16:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; TUNIT: [[LOOP17]] = distinct !{!17, !16} -; TUNIT: [[LOOP18]] = distinct !{!18, !16} -; TUNIT: [[TBAA19]] = !{!5, !5, i64 0} -; TUNIT: [[LOOP20]] = distinct !{!20, !16} -; TUNIT: [[LOOP21]] = distinct !{!21, !16} -; TUNIT: [[LOOP22]] = distinct !{!22, !16} -; TUNIT: [[LOOP23]] = distinct !{!23, !16} -; TUNIT: [[LOOP24]] = distinct !{!24, !16} -; TUNIT: [[LOOP25]] = distinct !{!25, !16} -; TUNIT: [[TBAA26]] = !{!9, !9, i64 0} -; TUNIT: [[LOOP27]] = distinct !{!27, !16} -; TUNIT: [[TBAA28]] = !{!29, !29, i64 0} -; TUNIT: [[META29:![0-9]+]] = !{!"long long", !5, i64 0} -; TUNIT: [[LOOP30]] = distinct !{!30, !16} -; TUNIT: [[LOOP31]] = distinct !{!31, !16} +; TUNIT: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; TUNIT: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; TUNIT: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +; TUNIT: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +; TUNIT: [[META6]] = !{!"Simple C/C++ TBAA"} +; TUNIT: [[TBAA7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], i64 12} +; TUNIT: [[META8]] = !{!"S", [[META4]], i64 0, [[META4]], i64 4, [[META4]], i64 8, [[META9]], i64 12, [[META9]], i64 16, [[META9]], i64 20} +; TUNIT: [[META9]] = !{!"float", [[META5]], i64 0} +; TUNIT: [[TBAA10]] = !{[[META8]], [[META9]], i64 16} +; TUNIT: [[TBAA11]] = !{[[META8]], [[META9]], i64 20} +; TUNIT: [[TBAA12]] = !{[[META8]], [[META4]], i64 0} +; TUNIT: [[TBAA13]] = !{[[META8]], [[META4]], i64 4} +; TUNIT: [[TBAA14]] = !{[[META8]], [[META4]], i64 8} +; TUNIT: [[LOOP15]] = distinct !{[[LOOP15]], [[META16:![0-9]+]]} +; TUNIT: [[META16]] = !{!"llvm.loop.mustprogress"} +; TUNIT: [[LOOP17]] = distinct !{[[LOOP17]], [[META16]]} +; TUNIT: [[LOOP18]] = distinct !{[[LOOP18]], [[META16]]} +; TUNIT: [[TBAA19]] = !{[[META5]], [[META5]], i64 0} +; TUNIT: [[LOOP20]] = distinct !{[[LOOP20]], [[META16]]} +; TUNIT: [[LOOP21]] = distinct !{[[LOOP21]], [[META16]]} +; TUNIT: [[LOOP22]] = distinct !{[[LOOP22]], [[META16]]} +; TUNIT: [[LOOP23]] = distinct !{[[LOOP23]], [[META16]]} +; TUNIT: [[LOOP24]] = distinct !{[[LOOP24]], [[META16]]} +; TUNIT: [[LOOP25]] = distinct !{[[LOOP25]], [[META16]]} +; TUNIT: [[TBAA26]] = !{[[META9]], [[META9]], i64 0} +; TUNIT: [[LOOP27]] 
= distinct !{[[LOOP27]], [[META16]]} +; TUNIT: [[TBAA28]] = !{[[META29:![0-9]+]], [[META29]], i64 0} +; TUNIT: [[META29]] = !{!"long long", [[META5]], i64 0} +; TUNIT: [[LOOP30]] = distinct !{[[LOOP30]], [[META16]]} +; TUNIT: [[LOOP31]] = distinct !{[[LOOP31]], [[META16]]} ;. ; CGSCC: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CGSCC: [[META1:![0-9]+]] = !{i32 7, !"uwtable", i32 1} -; CGSCC: [[META2:![0-9]+]] = !{!"clang version 13.0.0"} -; CGSCC: [[TBAA3]] = !{!4, !4, i64 0} -; CGSCC: [[META4:![0-9]+]] = !{!"int", !5, i64 0} -; CGSCC: [[META5:![0-9]+]] = !{!"omnipotent char", !6, i64 0} -; CGSCC: [[META6:![0-9]+]] = !{!"Simple C/C++ TBAA"} -; CGSCC: [[TBAA7]] = !{!8, !9, i64 12} -; CGSCC: [[META8:![0-9]+]] = !{!"S", !4, i64 0, !4, i64 4, !4, i64 8, !9, i64 12, !9, i64 16, !9, i64 20} -; CGSCC: [[META9:![0-9]+]] = !{!"float", !5, i64 0} -; CGSCC: [[TBAA10]] = !{!8, !9, i64 16} -; CGSCC: [[TBAA11]] = !{!8, !9, i64 20} -; CGSCC: [[TBAA12]] = !{!8, !4, i64 0} -; CGSCC: [[TBAA13]] = !{!8, !4, i64 4} -; CGSCC: [[TBAA14]] = !{!8, !4, i64 8} -; CGSCC: [[TBAA15]] = !{!5, !5, i64 0} -; CGSCC: [[LOOP16]] = distinct !{!16, !17} -; CGSCC: [[META17:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; CGSCC: [[TBAA18]] = !{!9, !9, i64 0} -; CGSCC: [[LOOP19]] = distinct !{!19, !17} -; CGSCC: [[TBAA20]] = !{!21, !21, i64 0} -; CGSCC: [[META21:![0-9]+]] = !{!"long long", !5, i64 0} -; CGSCC: [[LOOP22]] = distinct !{!22, !17} -; CGSCC: [[LOOP23]] = distinct !{!23, !17} -; CGSCC: [[LOOP24]] = distinct !{!24, !17} -; CGSCC: [[LOOP25]] = distinct !{!25, !17} -; CGSCC: [[LOOP26]] = distinct !{!26, !17} -; CGSCC: [[LOOP27]] = distinct !{!27, !17} -; CGSCC: [[LOOP28]] = distinct !{!28, !17} -; CGSCC: [[LOOP29]] = distinct !{!29, !17} -; CGSCC: [[LOOP30]] = distinct !{!30, !17} -; CGSCC: [[LOOP31]] = distinct !{!31, !17} +; CGSCC: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CGSCC: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} +; CGSCC: [[META4]] = !{!"int", [[META5:![0-9]+]], i64 0} +; CGSCC: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} +; CGSCC: [[META6]] = !{!"Simple C/C++ TBAA"} +; CGSCC: [[TBAA7]] = !{[[META8:![0-9]+]], [[META9:![0-9]+]], i64 12} +; CGSCC: [[META8]] = !{!"S", [[META4]], i64 0, [[META4]], i64 4, [[META4]], i64 8, [[META9]], i64 12, [[META9]], i64 16, [[META9]], i64 20} +; CGSCC: [[META9]] = !{!"float", [[META5]], i64 0} +; CGSCC: [[TBAA10]] = !{[[META8]], [[META9]], i64 16} +; CGSCC: [[TBAA11]] = !{[[META8]], [[META9]], i64 20} +; CGSCC: [[TBAA12]] = !{[[META8]], [[META4]], i64 0} +; CGSCC: [[TBAA13]] = !{[[META8]], [[META4]], i64 4} +; CGSCC: [[TBAA14]] = !{[[META8]], [[META4]], i64 8} +; CGSCC: [[TBAA15]] = !{[[META5]], [[META5]], i64 0} +; CGSCC: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]]} +; CGSCC: [[META17]] = !{!"llvm.loop.mustprogress"} +; CGSCC: [[TBAA18]] = !{[[META9]], [[META9]], i64 0} +; CGSCC: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]]} +; CGSCC: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; CGSCC: [[META21]] = !{!"long long", [[META5]], i64 0} +; CGSCC: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]]} +; CGSCC: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]]} +; CGSCC: [[LOOP24]] = distinct !{[[LOOP24]], [[META17]]} +; CGSCC: [[LOOP25]] = distinct !{[[LOOP25]], [[META17]]} +; CGSCC: [[LOOP26]] = distinct !{[[LOOP26]], [[META17]]} +; CGSCC: [[LOOP27]] = distinct !{[[LOOP27]], [[META17]]} +; CGSCC: [[LOOP28]] = distinct !{[[LOOP28]], [[META17]]} +; CGSCC: [[LOOP29]] = distinct !{[[LOOP29]], [[META17]]} +; CGSCC: [[LOOP30]] = 
distinct !{[[LOOP30]], [[META17]]} +; CGSCC: [[LOOP31]] = distinct !{[[LOOP31]], [[META17]]} ;. From d82eccc7524622e482d3dab2219651587eb93429 Mon Sep 17 00:00:00 2001 From: Nikita Popov <npopov@redhat.com> Date: Fri, 22 Dec 2023 16:52:20 +0100 Subject: [PATCH 185/342] [RegAllocFast] Avoid duplicate hash lookup (NFC) --- llvm/lib/CodeGen/RegAllocFast.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index a52013a74c2e1..d7edaa1d7ea47 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -81,8 +81,9 @@ class InstrPosIndexes { /// instructions index has been reassigned. bool getIndex(const MachineInstr &MI, uint64_t &Index) { assert(MI.getParent() == CurMBB && "MI is not in CurMBB"); - if (Instr2PosIndex.count(&MI)) { - Index = Instr2PosIndex[&MI]; + auto It = Instr2PosIndex.find(&MI); + if (It != Instr2PosIndex.end()) { + Index = It->second; return false; } From 4b6968952e653cb4da301d404717899393e4c530 Mon Sep 17 00:00:00 2001 From: Momchil Velikov <momchil.velikov@arm.com> Date: Fri, 22 Dec 2023 15:54:12 +0000 Subject: [PATCH 186/342] [AArch64] Implement spill/fill of predicate pair register classes (#76068) We are getting ICE with, e.g. ``` #include <arm_sve.h> void g(); svboolx2_t f0(int64_t i, int64_t n) { svboolx2_t r = svwhilelt_b16_x2(i, n); g(); return r; } ``` --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 18 +++- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 15 +++ .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 2 + llvm/test/CodeGen/AArch64/spillfill-sve.mir | 92 +++++++++++++++++++ .../AArch64/sve-pred-pair-spill-fill.ll | 67 ++++++++++++++ 5 files changed, 191 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-pred-pair-spill-fill.ll diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 757471d6a905e..bb7f4d907ffd7 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -747,6 +747,15 @@ bool AArch64ExpandPseudo::expandSetTagLoop( bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned Opc, unsigned N) { + assert((Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI || + Opc == AArch64::LDR_PXI || Opc == AArch64::STR_PXI) && + "Unexpected opcode"); + unsigned RState = (Opc == AArch64::LDR_ZXI || Opc == AArch64::LDR_PXI) + ? RegState::Define + : 0; + unsigned sub0 = (Opc == AArch64::LDR_ZXI || Opc == AArch64::STR_ZXI) + ? AArch64::zsub0 + : AArch64::psub0; const TargetRegisterInfo *TRI = MBB.getParent()->getSubtarget().getRegisterInfo(); MachineInstr &MI = *MBBI; @@ -756,9 +765,8 @@ bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB, assert(ImmOffset >= -256 && ImmOffset < 256 && "Immediate spill offset out of range"); BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)) - .addReg( - TRI->getSubReg(MI.getOperand(0).getReg(), AArch64::zsub0 + Offset), - Opc == AArch64::LDR_ZXI ? 
RegState::Define : 0) + .addReg(TRI->getSubReg(MI.getOperand(0).getReg(), sub0 + Offset), + RState) .addReg(MI.getOperand(1).getReg(), getKillRegState(Kill)) .addImm(ImmOffset); } @@ -1492,12 +1500,16 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3); case AArch64::STR_ZZXI: return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2); + case AArch64::STR_PPXI: + return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2); case AArch64::LDR_ZZZZXI: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4); case AArch64::LDR_ZZZXI: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3); case AArch64::LDR_ZZXI: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2); + case AArch64::LDR_PPXI: + return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2); case AArch64::BLR_RVMARKER: return expandCALL_RVMARKER(MBB, MBBI); case AArch64::BLR_BTI: diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index bc9678c13971f..1cfbf4737a6f7 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3771,6 +3771,13 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, MinOffset = -256; MaxOffset = 255; break; + case AArch64::LDR_PPXI: + case AArch64::STR_PPXI: + Scale = TypeSize::getScalable(2); + Width = TypeSize::getScalable(2 * 2); + MinOffset = -256; + MaxOffset = 254; + break; case AArch64::LDR_ZXI: case AArch64::STR_ZXI: Scale = TypeSize::getScalable(16); @@ -4814,6 +4821,10 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(SrcReg != AArch64::WSP); } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) Opc = AArch64::STRSui; + else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { + Opc = AArch64::STR_PPXI; + StackID = TargetStackID::ScalableVector; + } break; case 8: if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { @@ -4990,6 +5001,10 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, assert(DestReg != AArch64::WSP); } else if (AArch64::FPR32RegClass.hasSubClassEq(RC)) Opc = AArch64::LDRSui; + else if (AArch64::PPR2RegClass.hasSubClassEq(RC)) { + Opc = AArch64::LDR_PPXI; + StackID = TargetStackID::ScalableVector; + } break; case 8: if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) { diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 3dae6f7795ee9..344a153890631 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2398,11 +2398,13 @@ let Predicates = [HasSVEorSME] in { def LDR_ZZXI : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; } let mayStore = 1, hasSideEffects = 0 in { def STR_ZZXI : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; } let 
AddedComplexity = 1 in { diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir index 01756b8460019..ef7d55a1c2395 100644 --- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir +++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir @@ -7,6 +7,8 @@ target triple = "aarch64--linux-gnu" define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr() #0 { entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr2() #0 { entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr2mul2() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_pnr() #1 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_pnr() #1 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable } @@ -64,6 +66,96 @@ body: | RET_ReallyLR ... --- +name: spills_fills_stack_id_ppr2 +tracksRegLiveness: true +registers: + - { id: 0, class: ppr2 } +stack: +liveins: + - { reg: '$p0_p1', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $p0_p1 + + ; CHECK-LABEL: name: spills_fills_stack_id_ppr2 + ; CHECK: stack: + ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2 + ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' + + ; EXPAND-LABEL: name: spills_fills_stack_id_ppr2 + ; EXPAND: STR_PXI $p0, $sp, 6 + ; EXPAND: STR_PXI $p1, $sp, 7 + ; EXPAND: $p0 = LDR_PXI $sp, 6 + ; EXPAND: $p1 = LDR_PXI $sp, 7 + + %0:ppr2 = COPY $p0_p1 + + $p0 = IMPLICIT_DEF + $p1 = IMPLICIT_DEF + $p2 = IMPLICIT_DEF + $p3 = IMPLICIT_DEF + $p4 = IMPLICIT_DEF + $p5 = IMPLICIT_DEF + $p6 = IMPLICIT_DEF + $p7 = IMPLICIT_DEF + $p8 = IMPLICIT_DEF + $p9 = IMPLICIT_DEF + $p10 = IMPLICIT_DEF + $p11 = IMPLICIT_DEF + $p12 = IMPLICIT_DEF + $p13 = IMPLICIT_DEF + $p14 = IMPLICIT_DEF + $p15 = IMPLICIT_DEF + + $p0_p1 = COPY %0 + RET_ReallyLR +... +--- +name: spills_fills_stack_id_ppr2mul2 +tracksRegLiveness: true +registers: + - { id: 0, class: ppr2mul2 } +stack: +liveins: + - { reg: '$p0_p1', virtual-reg: '%0' } +body: | + bb.0.entry: + liveins: $p0_p1 + + ; CHECK-LABEL: name: spills_fills_stack_id_ppr2 + ; CHECK: stack: + ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2 + ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' + + ; EXPAND-LABEL: name: spills_fills_stack_id_ppr2mul2 + ; EXPAND: STR_PXI $p0, $sp, 6 + ; EXPAND: STR_PXI $p1, $sp, 7 + ; EXPAND: $p0 = LDR_PXI $sp, 6 + ; EXPAND: $p1 = LDR_PXI $sp, 7 + + %0:ppr2mul2 = COPY $p0_p1 + + $p0 = IMPLICIT_DEF + $p1 = IMPLICIT_DEF + $p2 = IMPLICIT_DEF + $p3 = IMPLICIT_DEF + $p4 = IMPLICIT_DEF + $p5 = IMPLICIT_DEF + $p6 = IMPLICIT_DEF + $p7 = IMPLICIT_DEF + $p8 = IMPLICIT_DEF + $p9 = IMPLICIT_DEF + $p10 = IMPLICIT_DEF + $p11 = IMPLICIT_DEF + $p12 = IMPLICIT_DEF + $p13 = IMPLICIT_DEF + $p14 = IMPLICIT_DEF + $p15 = IMPLICIT_DEF + + $p0_p1 = COPY %0 + RET_ReallyLR +... 
+--- name: spills_fills_stack_id_pnr tracksRegLiveness: true registers: diff --git a/llvm/test/CodeGen/AArch64/sve-pred-pair-spill-fill.ll b/llvm/test/CodeGen/AArch64/sve-pred-pair-spill-fill.ll new file mode 100644 index 0000000000000..4dcc81feb72f1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-pred-pair-spill-fill.ll @@ -0,0 +1,67 @@ +; RUN: llc < %s | FileCheck %s + +; Derived from +; #include <arm_sve.h> + +; void g(); + +; svboolx2_t f0(int64_t i, int64_t n) { +; svboolx2_t r = svwhilelt_b16_x2(i, n); +; g(); +; return r; +; } + +; svboolx2_t f1(svcount_t n) { +; svboolx2_t r = svpext_lane_c8_x2(n, 1); +; g(); +; return r; +; } +; +; Check that predicate register pairs are spilled/filled without an ICE in the backend. + +target triple = "aarch64-unknown-linux" + +define <vscale x 32 x i1> @f0(i64 %i, i64 %n) #0 { +entry: + %0 = tail call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64 %i, i64 %n) + %1 = extractvalue { <vscale x 8 x i1>, <vscale x 8 x i1> } %0, 0 + %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1) + %3 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %2, i64 0) + %4 = extractvalue { <vscale x 8 x i1>, <vscale x 8 x i1> } %0, 1 + %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %4) + %6 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %3, <vscale x 16 x i1> %5, i64 16) + tail call void @g() + ret <vscale x 32 x i1> %6 +} +; CHECK-LABEL: f0: +; CHECK: whilelt { p0.h, p1.h } +; CHECK: str p0, [sp, #6, mul vl] +; CHECK: str p1, [sp, #7, mul vl] +; CHECK: ldr p0, [sp, #6, mul vl] +; CHECK: ldr p1, [sp, #7, mul vl] + +define <vscale x 32 x i1> @f1(target("aarch64.svcount") %n) #0 { +entry: + %0 = tail call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") %n, i32 1) + %1 = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } %0, 0 + %2 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %1, i64 0) + %3 = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } %0, 1 + %4 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %2, <vscale x 16 x i1> %3, i64 16) + tail call void @g() + ret <vscale x 32 x i1> %4 +} + +; CHECK-LABEL: f1: +; CHECK: pext { p0.b, p1.b } +; CHECK: str p0, [sp, #6, mul vl] +; CHECK: str p1, [sp, #7, mul vl] +; CHECK: ldr p0, [sp, #6, mul vl] +; CHECK: ldr p1, [sp, #7, mul vl] + +declare void @g(...) +declare { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64, i64) +declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>) +declare <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1>, <vscale x 16 x i1>, i64 immarg) +declare { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount"), i32 immarg) #1 + +attributes #0 = { nounwind "target-features"="+sve,+sve2,+sve2p1" } From dd85c6cce4fc60fa4850770d66f783300a700f3a Mon Sep 17 00:00:00 2001 From: Fangrui Song <i@maskray.me> Date: Fri, 22 Dec 2023 08:28:07 -0800 Subject: [PATCH 187/342] [Sema] Add -Wc++11-narrowing-const-reference (#76094) https://github.com/llvm/llvm-project/pull/75332 diagnosed narrowing involving const reference. 
Our depot has hundreds if not thousands of breakages (https://github.com/llvm/llvm-project/pull/75332#issuecomment-1864757240). Add a subgroup of -Wc++11-narrowing to help users gradually fix their issues without regressing the existing -Wc++11-narrowing diagnostics. --- clang/include/clang/Basic/DiagnosticGroups.td | 3 +- .../clang/Basic/DiagnosticSemaKinds.td | 12 +++++++ clang/lib/Sema/SemaInit.cpp | 31 +++++++++++++------ .../dcl.decl/dcl.init/dcl.init.list/p7-0x.cpp | 3 ++ clang/test/SemaCXX/GH63151.cpp | 10 +++--- 5 files changed, 44 insertions(+), 15 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 7cf347e92d997..6765721ae7002 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -348,7 +348,8 @@ def CXX98CompatPedantic : DiagGroup<"c++98-compat-pedantic", CXXPre20CompatPedantic, CXXPre23CompatPedantic]>; -def CXX11Narrowing : DiagGroup<"c++11-narrowing">; +def CXX11NarrowingConstReference : DiagGroup<"c++11-narrowing-const-reference">; +def CXX11Narrowing : DiagGroup<"c++11-narrowing", [CXX11NarrowingConstReference]>; def CXX11WarnInconsistentOverrideDestructor : DiagGroup<"inconsistent-missing-destructor-override">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index c100041ca400f..aebb7d9b945c3 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -6158,12 +6158,24 @@ def err_illegal_initializer_type : Error<"illegal initializer type %0">; def ext_init_list_type_narrowing : ExtWarn< "type %0 cannot be narrowed to %1 in initializer list">, InGroup<CXX11Narrowing>, DefaultError, SFINAEFailure; +// *_narrowing_const_reference diagnostics have the same messages, but are +// controlled by -Wc++11-narrowing-const-reference for narrowing involving a +// const reference. +def ext_init_list_type_narrowing_const_reference : ExtWarn< + "type %0 cannot be narrowed to %1 in initializer list">, + InGroup<CXX11NarrowingConstReference>, DefaultError, SFINAEFailure; def ext_init_list_variable_narrowing : ExtWarn< "non-constant-expression cannot be narrowed from type %0 to %1 in " "initializer list">, InGroup<CXX11Narrowing>, DefaultError, SFINAEFailure; +def ext_init_list_variable_narrowing_const_reference : ExtWarn< + "non-constant-expression cannot be narrowed from type %0 to %1 in " + "initializer list">, InGroup<CXX11NarrowingConstReference>, DefaultError, SFINAEFailure; def ext_init_list_constant_narrowing : ExtWarn< "constant expression evaluates to %0 which cannot be narrowed to type %1">, InGroup<CXX11Narrowing>, DefaultError, SFINAEFailure; +def ext_init_list_constant_narrowing_const_reference : ExtWarn< + "constant expression evaluates to %0 which cannot be narrowed to type %1">, + InGroup<CXX11NarrowingConstReference>, DefaultError, SFINAEFailure; def warn_init_list_type_narrowing : Warning< "type %0 cannot be narrowed to %1 in initializer list in C++11">, InGroup<CXX11Narrowing>, DefaultIgnore; diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index f768d2726b0a1..cc9db5ded1149 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -10411,40 +10411,53 @@ static void DiagnoseNarrowingInInitList(Sema &S, // No narrowing occurred. 
return; - case NK_Type_Narrowing: + case NK_Type_Narrowing: { // This was a floating-to-integer conversion, which is always considered a // narrowing conversion even if the value is a constant and can be // represented exactly as an integer. - S.Diag(PostInit->getBeginLoc(), NarrowingErrs(S.getLangOpts()) - ? diag::ext_init_list_type_narrowing - : diag::warn_init_list_type_narrowing) + QualType T = EntityType.getNonReferenceType(); + S.Diag(PostInit->getBeginLoc(), + NarrowingErrs(S.getLangOpts()) + ? (T == EntityType + ? diag::ext_init_list_type_narrowing + : diag::ext_init_list_type_narrowing_const_reference) + : diag::warn_init_list_type_narrowing) << PostInit->getSourceRange() << PreNarrowingType.getLocalUnqualifiedType() - << EntityType.getNonReferenceType().getLocalUnqualifiedType(); + << T.getLocalUnqualifiedType(); break; + } - case NK_Constant_Narrowing: + case NK_Constant_Narrowing: { // A constant value was narrowed. + QualType T = EntityType.getNonReferenceType(); S.Diag(PostInit->getBeginLoc(), NarrowingErrs(S.getLangOpts()) - ? diag::ext_init_list_constant_narrowing + ? (T == EntityType + ? diag::ext_init_list_constant_narrowing + : diag::ext_init_list_constant_narrowing_const_reference) : diag::warn_init_list_constant_narrowing) << PostInit->getSourceRange() << ConstantValue.getAsString(S.getASTContext(), ConstantType) << EntityType.getNonReferenceType().getLocalUnqualifiedType(); break; + } - case NK_Variable_Narrowing: + case NK_Variable_Narrowing: { // A variable's value may have been narrowed. + QualType T = EntityType.getNonReferenceType(); S.Diag(PostInit->getBeginLoc(), NarrowingErrs(S.getLangOpts()) - ? diag::ext_init_list_variable_narrowing + ? (T == EntityType + ? diag::ext_init_list_variable_narrowing + : diag::ext_init_list_variable_narrowing_const_reference) : diag::warn_init_list_variable_narrowing) << PostInit->getSourceRange() << PreNarrowingType.getLocalUnqualifiedType() << EntityType.getNonReferenceType().getLocalUnqualifiedType(); break; } + } SmallString<128> StaticCast; llvm::raw_svector_ostream OS(StaticCast); diff --git a/clang/test/CXX/dcl.decl/dcl.init/dcl.init.list/p7-0x.cpp b/clang/test/CXX/dcl.decl/dcl.init/dcl.init.list/p7-0x.cpp index eac9ac0e82798..2bceb3e267790 100644 --- a/clang/test/CXX/dcl.decl/dcl.init/dcl.init.list/p7-0x.cpp +++ b/clang/test/CXX/dcl.decl/dcl.init/dcl.init.list/p7-0x.cpp @@ -1,4 +1,7 @@ // RUN: %clang_cc1 -fsyntax-only -std=c++11 -triple x86_64-apple-macosx10.6.7 -verify %s +// The following narrowing does not involve const references, so +// -Wno-c++11-narrowing-const-reference does not suppress the errors. +// RUN: %clang_cc1 -fsyntax-only -std=c++11 -triple x86_64-apple-macosx10.6.7 -Wno-c++11-narrowing-const-reference -verify %s // Verify that narrowing conversions in initializer lists cause errors in C++0x // mode. 
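To illustrate the split between the two warning groups, here is a minimal sketch derived from the test updates in this commit (compile with -std=c++11; the behaviour attributed to -Wno-c++11-narrowing-const-reference below is an assumption taken from what GH63151.cpp and p7-0x.cpp exercise, not additional functionality):
```
// Sketch only: narrowing that happens while binding a const reference is now
// controlled by the new -Wc++11-narrowing-const-reference subgroup, while
// ordinary list-initialization narrowing stays under -Wc++11-narrowing.
struct A {
  A(const unsigned &x); // initializing from {-1} narrows through a const reference
};

void demo(int p) {
  A a{-1};       // error by default; suppressed by
                 // -Wno-c++11-narrowing-const-reference
  unsigned u{p}; // no const reference involved: remains an error under
                 // -Wc++11-narrowing even with the new subgroup disabled
}
```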
diff --git a/clang/test/SemaCXX/GH63151.cpp b/clang/test/SemaCXX/GH63151.cpp index 2c7533ed88f3b..a4d0da0beee21 100644 --- a/clang/test/SemaCXX/GH63151.cpp +++ b/clang/test/SemaCXX/GH63151.cpp @@ -1,12 +1,12 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s - +// RUN: %clang_cc1 -fsyntax-only -verify=expected,narrowing %s +// RUN: %clang_cc1 -fsyntax-only -Wno-c++11-narrowing-const-reference -verify %s struct A { A(const unsigned &x) {} }; void foo(int p) { - A a { -1 }; // expected-error {{constant expression evaluates to -1 which cannot be narrowed to type 'unsigned int'}} + A a { -1 }; // narrowing-error {{constant expression evaluates to -1 which cannot be narrowed to type 'unsigned int'}} A b { 0 }; - A c { p }; // expected-error {{non-constant-expression cannot be narrowed from type 'int' to 'unsigned int' in initializer list}} - A d { 0.5 }; // expected-error {{type 'double' cannot be narrowed to 'unsigned int' in initializer list}} + A c { p }; // narrowing-error {{non-constant-expression cannot be narrowed from type 'int' to 'unsigned int' in initializer list}} + A d { 0.5 }; // narrowing-error {{type 'double' cannot be narrowed to 'unsigned int' in initializer list}} // expected-warning@-1 {{implicit conversion from 'double' to 'unsigned int' changes value from 0.5 to 0}} } From 6c2ad8ac7b12a0963e15a853b71904b861a83af4 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas <alexandros.lamprineas@arm.com> Date: Fri, 22 Dec 2023 16:29:18 +0000 Subject: [PATCH 188/342] [TLI][NFC] Autogenerate vectorized call tests for SLEEF/ArmPL. (#76146) This patch prepares the ground for #76060. * Unifies ArmPL and SLEEF tests for better coverage * Replaces deprecated float* and double* types with ptr * Adds noalias attribute to pointer arguments * Adds some cmd-line options to the RUN lines to simplify output * Removes datalayout since target triple is provided * Removes checks for return statements * Refactors the regex filter for autogenerated checks * Removes redundant test file suffix (already under the AArch64 dir) --- .../LoopVectorize/AArch64/armpl-calls.ll | 1846 ------------ .../LoopVectorize/AArch64/armpl-intrinsics.ll | 554 ---- .../AArch64/sleef-calls-aarch64.ll | 1347 --------- .../AArch64/sleef-intrinsic-calls-aarch64.ll | 1290 -------- .../AArch64/veclib-function-calls.ll | 2641 +++++++++++++++++ .../AArch64/veclib-intrinsic-calls.ll | 1547 ++++++++++ 6 files changed, 4188 insertions(+), 5037 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/armpl-calls.ll delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/armpl-intrinsics.ll delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/armpl-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/armpl-calls.ll deleted file mode 100644 index aa5fdf59e14c0..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/armpl-calls.ll +++ /dev/null @@ -1,1846 +0,0 @@ -; RUN: opt -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,NEON -; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize -S < %s | FileCheck %s --check-prefixes=CHECK,SVE - -target datalayout = 
"e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-gnu" - - -; Tests are checking if LV can vectorize loops with function calls -; using mappings from TLI for scalable and fixed width vectorization. - -declare double @acos(double) -declare float @acosf(float) - -define void @acos_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @acos_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vacosq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svacos_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @acos(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @acos_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @acos_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vacosq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svacos_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @acosf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @acosh(double) -declare float @acoshf(float) - -define void @acosh_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @acosh_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vacoshq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svacosh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @acosh(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @acosh_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @acosh_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vacoshq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svacosh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @acoshf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - 
store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @asin(double) -declare float @asinf(float) - -define void @asin_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @asin_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vasinq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svasin_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @asin(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @asin_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @asin_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vasinq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svasin_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @asinf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @asinh(double) -declare float @asinhf(float) - -define void @asinh_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @asinh_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vasinhq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svasinh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @asinh(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @asinh_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @asinh_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vasinhq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svasinh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @asinhf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - 
%iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @atan(double) -declare float @atanf(float) - -define void @atan_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @atan_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vatanq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svatan_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @atan(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @atan_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @atan_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vatanq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svatan_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @atanf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @atanh(double) -declare float @atanhf(float) - -define void @atanh_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @atanh_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vatanhq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svatanh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @atanh(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @atanh_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @atanh_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vatanhq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svatanh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @atanhf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = 
icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @cbrt(double) -declare float @cbrtf(float) - -define void @cbrt_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @cbrt_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vcbrtq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svcbrt_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @cbrt(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @cbrt_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @cbrt_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vcbrtq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svcbrt_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @cbrtf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @cos(double) -declare float @cosf(float) - -define void @cos_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @cos_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vcosq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @cos(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @cos_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @cos_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vcosq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @cosf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label 
%for.body - - for.end: - ret void -} - -declare double @cosh(double) -declare float @coshf(float) - -define void @cosh_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @cosh_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vcoshq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svcosh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @cosh(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @cosh_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @cosh_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vcoshq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svcosh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @coshf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @erf(double) -declare float @erff(float) - -define void @erf_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @erf_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_verfq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_sverf_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @erf(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @erf_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @erf_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_verfq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_sverf_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @erff(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @erfc(double) 
-declare float @erfcf(float) - -define void @erfc_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @erfc_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_verfcq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_sverfc_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @erfc(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @erfc_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @erfc_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_verfcq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_sverfc_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @erfcf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @exp(double) -declare float @expf(float) - -define void @exp_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @exp_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vexpq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @exp(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @exp_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @exp_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vexpq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @expf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @exp2(double) -declare float @exp2f(float) - -define void @exp2_f64(ptr nocapture 
%in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @exp2_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vexp2q_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @exp2(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @exp2_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @exp2_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vexp2q_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @exp2f(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @exp10(double) -declare float @exp10f(float) - -define void @exp10_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @exp10_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vexp10q_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @exp10(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @exp10_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @exp10_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vexp10q_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @exp10f(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @expm1(double) -declare float @expm1f(float) - -define void @expm1_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: 
@expm1_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vexpm1q_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svexpm1_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @expm1(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @expm1_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @expm1_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vexpm1q_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svexpm1_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @expm1f(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @lgamma(double) -declare float @lgammaf(float) - -define void @lgamma_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @lgamma_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlgammaq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlgamma_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @lgamma(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @lgamma_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @lgamma_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlgammaq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlgamma_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @lgammaf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @log(double) -declare float @logf(float) - -define void @log_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @log_f64( -; NEON: [[TMP5:%.*]] 
= call <2 x double> @armpl_vlogq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @log(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @log_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @log_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlogq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @logf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @log1p(double) -declare float @log1pf(float) - -define void @log1p_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @log1p_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlog1pq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlog1p_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @log1p(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @log1p_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @log1p_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlog1pq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlog1p_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @log1pf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @log2(double) -declare float @log2f(float) - -define void @log2_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @log2_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlog2q_f64(<2 x double> 
[[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @log2(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @log2_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @log2_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlog2q_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @log2f(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @log10(double) -declare float @log10f(float) - -define void @log10_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @log10_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vlog10q_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @log10(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @log10_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @log10_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vlog10q_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @log10f(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @sin(double) -declare float @sinf(float) - -define void @sin_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @sin_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vsinq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x 
double> @armpl_svsin_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @sin(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @sin_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @sin_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vsinq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @sinf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @sinh(double) -declare float @sinhf(float) - -define void @sinh_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @sinh_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vsinhq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svsinh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @sinh(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @sinh_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @sinh_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vsinhq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svsinh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @sinhf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @sinpi(double) -declare float @sinpif(float) - -define void @sinpi_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @sinpi_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vsinpiq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svsinpi_f64_x(<vscale x 2 x double> [[TMP4:%.*]], 
<vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @sinpi(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @sinpi_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @sinpi_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vsinpiq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svsinpi_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @sinpif(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @sqrt(double) -declare float @sqrtf(float) - -define void @sqrt_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @sqrt_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vsqrtq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svsqrt_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @sqrt(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @sqrt_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @sqrt_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vsqrtq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svsqrt_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @sqrtf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @tan(double) -declare float @tanf(float) - -define void @tan_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @tan_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vtanq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svtan_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br 
label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @tan(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @tan_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @tan_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vtanq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svtan_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @tanf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @tanh(double) -declare float @tanhf(float) - -define void @tanh_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @tanh_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vtanhq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svtanh_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @tanh(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @tanh_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @tanh_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vtanhq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svtanh_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @tanhf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @tgamma(double) -declare float @tgammaf(float) - -define void @tgamma_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @tgamma_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vtgammaq_f64(<2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svtgamma_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, 
%entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @tgamma(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @tgamma_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @tgamma_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vtgammaq_f32(<4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svtgamma_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @tgammaf(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @atan2(double, double) -declare float @atan2f(float, float) - -define void @atan2_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @atan2_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vatan2q_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svatan2_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @atan2(double %in, double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @atan2_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @atan2_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vatan2q_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svatan2_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @atan2f(float %in, float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @copysign(double, double) -declare float @copysignf(float, float) - -define void @copysign_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @copysign_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vcopysignq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call 
<vscale x 2 x double> @armpl_svcopysign_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @copysign(double %in, double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @copysign_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @copysign_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vcopysignq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svcopysign_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @copysignf(float %in, float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @fdim(double, double) -declare float @fdimf(float, float) - -define void @fdim_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @fdim_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vfdimq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svfdim_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @fdim(double %in, double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @fdim_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @fdim_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vfdimq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svfdim_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @fdimf(float %in, float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double 
@fmin(double, double) -declare float @fminf(float, float) - -define void @fmin_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @fmin_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vfminq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svfmin_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @fmin(double %in, double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @fmin_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @fmin_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vfminq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svfmin_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @fminf(float %in, float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @fmod(double, double) -declare float @fmodf(float, float) - -define void @fmod_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @fmod_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vfmodq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svfmod_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @fmod(double %in, double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @fmod_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @fmod_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vfmodq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svfmod_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @fmodf(float %in, float %in) - %out.gep = 
getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @hypot(double, double) -declare float @hypotf(float, float) - -define void @hypot_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @hypot_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vhypotq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svhypot_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @hypot(double %in, double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @hypot_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @hypot_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vhypotq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svhypot_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @hypotf(float %in, float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @nextafter(double, double) -declare float @nextafterf(float, float) - -define void @nextafter_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @nextafter_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vnextafterq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svnextafter_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @nextafter(double %in, double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @nextafter_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @nextafter_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vnextafterq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svnextafter_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> 
[[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @nextafterf(float %in, float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @pow(double, double) -declare float @powf(float, float) - -define void @pow_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @pow_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vpowq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svpow_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @pow(double %in, double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @pow_f32(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @pow_f32( -; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vpowq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svpow_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @powf(float %in, float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @fma(double, double, double) -declare float @fmaf(float, float, float) - -define void @fma_f64(ptr nocapture %in.ptr, ptr %out.ptr) { -; CHECK-LABEL: @fma_f64( -; NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vfmaq_f64(<2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]], <2 x double> [[TMP4:%.*]]) -; SVE: [[TMP5:%.*]] = call <vscale x 2 x double> @armpl_svfma_f64_x(<vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x double> [[TMP4:%.*]], <vscale x 2 x i1> {{.*}}) -; CHECK: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @fma(double %in, double %in, double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - 
ret void
-}
-
-define void @fma_f32(ptr nocapture %in.ptr, ptr %out.ptr) {
-; CHECK-LABEL: @fma_f32(
-; NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vfmaq_f32(<4 x float> [[TMP4:%.*]], <4 x float> [[TMP4:%.*]])
-; SVE: [[TMP5:%.*]] = call <vscale x 4 x float> @armpl_svfma_f32_x(<vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x float> [[TMP4:%.*]], <vscale x 4 x i1> {{.*}})
-; CHECK: ret void
-;
- entry:
- br label %for.body
-
- for.body:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
- %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv
- %in = load float, ptr %in.gep, align 8
- %call = tail call float @fmaf(float %in, float %in, float %in)
- %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv
- store float %call, ptr %out.gep, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
-
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/armpl-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/armpl-intrinsics.ll
deleted file mode 100644
index 96d94f72fabf0..0000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/armpl-intrinsics.ll
+++ /dev/null
@@ -1,554 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(\.|_v|_sv)(ceil|copysign|cos|exp\.|expf?\(|exp2|exp10|fabs|floor|fma|log|m..num|pow|nearbyint|rint|round|sin|sqrt|trunc)|(ret)" --version 2
-; RUN: opt -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefixes=NEON
-; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefixes=SVE
-
-target triple = "aarch64-unknown-linux-gnu"
-
-; Tests are checking if LV can vectorize loops with llvm math intrinsics
-; using mappings from TLI for scalable and fixed width vectorization.
- -declare double @llvm.cos.f64(double) -declare float @llvm.cos.f32(float) - -define void @cos_f64(ptr noalias %in.ptr, ptr %out.ptr) { -; -; NEON-LABEL: define void @cos_f64 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcosq_f64(<2 x double> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @cos_f64 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] { -; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @llvm.cos.f64(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @cos_f32(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @cos_f32 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcosq_f32(<4 x float> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @cos_f32 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @llvm.cos.f32(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.exp.f64(double) -declare float @llvm.exp.f32(float) - -define void @exp_f64(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @exp_f64 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: ret void -; -; SVE-LABEL: define void @exp_f64 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @llvm.exp.f64(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @exp_f32(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @exp_f32 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: ret void -; -; SVE-LABEL: define void @exp_f32 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, 
%entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @llvm.exp.f32(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.exp2.f64(double) -declare float @llvm.exp2.f32(float) - -define void @exp2_f64(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @exp2_f64 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp2q_f64(<2 x double> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @exp2_f64 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @llvm.exp2.f64(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @exp2_f32(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @exp2_f32 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp2q_f32(<4 x float> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @exp2_f32 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @llvm.exp2.f32(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.exp10.f64(double) -declare float @llvm.exp10.f32(float) - -define void @exp10_f64(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @exp10_f64 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp10q_f64(<2 x double> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @exp10_f64 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load 
double, ptr %in.gep, align 8 - %call = tail call double @llvm.exp10.f64(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @exp10_f32(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @exp10_f32 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp10q_f32(<4 x float> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @exp10_f32 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @llvm.exp10.f32(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.log.f64(double) -declare float @llvm.log.f32(float) - -define void @log_f64(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @log_f64 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlogq_f64(<2 x double> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log_f64 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @llvm.log.f64(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @log_f32(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @log_f32 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlogq_f32(<4 x float> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log_f32 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @llvm.log.f32(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add 
nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.log2.f64(double) -declare float @llvm.log2.f32(float) - -define void @log2_f64(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @log2_f64 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog2q_f64(<2 x double> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log2_f64 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @llvm.log2.f64(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @log2_f32(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @log2_f32 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog2q_f32(<4 x float> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log2_f32 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @llvm.log2.f32(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.log10.f64(double) -declare float @llvm.log10.f32(float) - -define void @log10_f64(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @log10_f64 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog10q_f64(<2 x double> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log10_f64 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @llvm.log10.f64(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label 
%for.body - - for.end: - ret void -} - -define void @log10_f32(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @log10_f32 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog10q_f32(<4 x float> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log10_f32 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @llvm.log10.f32(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.sin.f64(double) -declare float @llvm.sin.f32(float) - -define void @sin_f64(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @sin_f64 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinq_f64(<2 x double> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @sin_f64 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @llvm.sin.f64(double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @sin_f32(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @sin_f32 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinq_f32(<4 x float> [[WIDE_LOAD:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @sin_f32 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @llvm.sin.f32(float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.pow.f64(double, double) -declare float @llvm.pow.f32(float, float) - -define void @pow_f64(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define 
void @pow_f64 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vpowq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) -; NEON: ret void -; -; SVE-LABEL: define void @pow_f64 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 2 x double> @armpl_svpow_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv - %in = load double, ptr %in.gep, align 8 - %call = tail call double @llvm.pow.f64(double %in, double %in) - %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv - store double %call, ptr %out.gep, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @pow_f32(ptr noalias %in.ptr, ptr %out.ptr) { -; NEON-LABEL: define void @pow_f32 -; NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) { -; NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vpowq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) -; NEON: ret void -; -; SVE-LABEL: define void @pow_f32 -; SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { -; SVE: [[TMP13:%.*]] = call <vscale x 4 x float> @armpl_svpow_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv - %in = load float, ptr %in.gep, align 8 - %call = tail call float @llvm.pow.f32(float %in, float %in) - %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv - store float %call, ptr %out.gep, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll deleted file mode 100644 index bd39dcb3371a9..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-calls-aarch64.ll +++ /dev/null @@ -1,1347 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(_)|(cos|expf?\(|exp2|exp10|fmod|gamma|log|pow|sin|sqrt|tan)|(ret)" --version 2 -; RUN: opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=NEON -; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SVE - -target triple = "aarch64-unknown-linux-gnu" - -declare double @acos(double) -declare float @acosf(float) - -define void @acos_f64(double* nocapture %varray) { -; NEON-LABEL: define void @acos_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; 
SVE-LABEL: define void @acos_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0:[0-9]+]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @acos(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @acos_f32(float* nocapture %varray) { -; NEON-LABEL: define void @acos_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @acos_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @acosf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @asin(double) -declare float @asinf(float) - -define void @asin_f64(double* nocapture %varray) { -; NEON-LABEL: define void @asin_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @asin_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_asin(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @asin(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @asin_f32(float* nocapture %varray) { -; NEON-LABEL: define void @asin_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @asin_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @asinf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 
%iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @atan(double) -declare float @atanf(float) - -define void @atan_f64(double* nocapture %varray) { -; NEON-LABEL: define void @atan_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @atan_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @atan(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @atan_f32(float* nocapture %varray) { -; NEON-LABEL: define void @atan_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @atan_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @atanf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @atan2(double, double) -declare float @atan2f(float, float) - -define void @atan2_f64(double* nocapture %varray) { -; NEON-LABEL: define void @atan2_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @atan2_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x double> [[TMP19]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @atan2(double %conv, double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @atan2_f32(float* nocapture %varray) { -; NEON-LABEL: define void @atan2_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x 
float> @_ZGVnN4vv_atan2f(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @atan2_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @atan2f(float %conv, float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @atanh(double) -declare float @atanhf(float) - -define void @atanh_f64(double* nocapture %varray) { -; NEON-LABEL: define void @atanh_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @atanh_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atanh(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @atanh(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @atanh_f32(float* nocapture %varray) { -; NEON-LABEL: define void @atanh_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @atanh_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanhf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @atanhf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @cos(double) -declare float @cosf(float) - -define void @cos_f64(double* nocapture %varray) { -; NEON-LABEL: define void @cos_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @cos_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry 
], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @cos(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @cos_f32(float* nocapture %varray) { -; NEON-LABEL: define void @cos_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @cos_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @cosf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @cosh(double) -declare float @coshf(float) - -define void @cosh_f64(double* nocapture %varray) { -; NEON-LABEL: define void @cosh_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @cosh_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @cosh(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @cosh_f32(float* nocapture %varray) { -; NEON-LABEL: define void @cosh_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @cosh_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @coshf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @exp(double) -declare float @expf(float) - -define void @exp_f64(double* nocapture %varray) { -; NEON-LABEL: define 
void @exp_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @exp_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @exp(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @exp_f32(float* nocapture %varray) { -; NEON-LABEL: define void @exp_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @exp_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @expf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @exp2(double) -declare float @exp2f(float) - -define void @exp2_f64(double* nocapture %varray) { -; NEON-LABEL: define void @exp2_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @exp2_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @exp2(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @exp2_f32(float* nocapture %varray) { -; NEON-LABEL: define void @exp2_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @exp2_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv 
to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @exp2f(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @exp10(double) -declare float @exp10f(float) - -define void @exp10_f64(double* nocapture %varray) { -; NEON-LABEL: define void @exp10_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @exp10_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @exp10(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @exp10_f32(float* nocapture %varray) { -; NEON-LABEL: define void @exp10_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @exp10_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @exp10f(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @fmod(double, double) -declare float @fmodf(float, float) - -define void @fmod_f64(double* nocapture %varray) { -; NEON-LABEL: define void @fmod_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_fmod(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @fmod_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_fmod(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x double> [[TMP19]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @fmod(double %conv, double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void 
@fmod_f32(float* nocapture %varray) { -; NEON-LABEL: define void @fmod_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_fmodf(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @fmod_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_fmodf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @fmodf(float %conv, float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @lgamma(double) -declare float @lgammaf(float) - -define void @lgamma_f64(double* nocapture %varray) { -; NEON-LABEL: define void @lgamma_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @lgamma_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_lgamma(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @lgamma(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @lgamma_f32(float* nocapture %varray) { -; NEON-LABEL: define void @lgamma_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @lgamma_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_lgammaf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @lgammaf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @log10(double) -declare float @log10f(float) - -define void @log10_f64(double* nocapture %varray) { -; NEON-LABEL: define void @log10_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log10_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> 
@_ZGVsMxv_log10(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @log10(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @log10_f32(float* nocapture %varray) { -; NEON-LABEL: define void @log10_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log10_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @log10f(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @log2(double) -declare float @log2f(float) - -define void @log2_f64(double* nocapture %varray) { -; NEON-LABEL: define void @log2_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log2_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @log2(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @log2_f32(float* nocapture %varray) { -; NEON-LABEL: define void @log2_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log2_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @log2f(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 
%exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @log(double) -declare float @logf(float) - -define void @log_f64(double* nocapture %varray) { -; NEON-LABEL: define void @log_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @log(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @log_f32(float* nocapture %varray) { -; NEON-LABEL: define void @log_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @log_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @logf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @pow(double, double) -declare float @powf(float, float) - -define void @pow_f64(double* nocapture %varray) { -; NEON-LABEL: define void @pow_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @pow_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x double> [[TMP19]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @pow(double %conv, double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @pow_f32(float* nocapture %varray) { -; NEON-LABEL: define void @pow_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @pow_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) 
#[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x float> [[TMP19]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @powf(float %conv, float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @sin(double) -declare float @sinf(float) - -define void @sin_f64(double* nocapture %varray) { -; NEON-LABEL: define void @sin_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @sin_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @sin(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @sin_f32(float* nocapture %varray) { -; NEON-LABEL: define void @sin_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @sin_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @sinf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @sinh(double) -declare float @sinhf(float) - -define void @sinh_f64(double* nocapture %varray) { -; NEON-LABEL: define void @sinh_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @sinh_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @sinh(double %conv) - %arrayidx = getelementptr inbounds double, double* 
%varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @sinh_f32(float* nocapture %varray) { -; NEON-LABEL: define void @sinh_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @sinh_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @sinhf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @sqrt(double) -declare float @sqrtf(float) - -define void @sqrt_f64(double* nocapture %varray) { -; NEON-LABEL: define void @sqrt_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @sqrt_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sqrt(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @sqrt(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @sqrt_f32(float* nocapture %varray) { -; NEON-LABEL: define void @sqrt_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @sqrt_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sqrtf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @sqrtf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @tan(double) -declare float @tanf(float) - -define void @tan_f64(double* nocapture %varray) { -; NEON-LABEL: define void @tan_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define 
void @tan_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @tan(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @tan_f32(float* nocapture %varray) { -; NEON-LABEL: define void @tan_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @tan_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @tanf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @tanh(double) -declare float @tanhf(float) - -define void @tanh_f64(double* nocapture %varray) { -; NEON-LABEL: define void @tanh_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @tanh_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @tanh(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @tanh_f32(float* nocapture %varray) { -; NEON-LABEL: define void @tanh_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @tanh_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @tanhf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* 
%arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @tgamma(double) -declare float @tgammaf(float) - -define void @tgamma_f64(double* nocapture %varray) { -; NEON-LABEL: define void @tgamma_f64 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @tgamma_f64 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tgamma(<vscale x 2 x double> [[TMP19:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @tgamma(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @tgamma_f32(float* nocapture %varray) { -; NEON-LABEL: define void @tgamma_f32 -; NEON-SAME: (ptr nocapture [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @tgamma_f32 -; SVE-SAME: (ptr nocapture [[VARRAY:%.*]]) #[[ATTR0]] { -; SVE: [[TMP20:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tgammaf(<vscale x 4 x float> [[TMP19:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @tgammaf(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll deleted file mode 100644 index 2300ce74996e3..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sleef-intrinsic-calls-aarch64.ll +++ /dev/null @@ -1,1290 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(\.|_)(ceil|copysign|cos|exp\.|expf?\(|exp2|exp10|fabs|floor|fma|log|m..num|pow|nearbyint|rint|round|sin|sqrt|trunc)|(ret)" --version 2 -; RUN: opt -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=NEON -; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SVE - -target triple = "aarch64-unknown-linux-gnu" - -; Tests are checking if LV can vectorize loops with llvm math intrinsics using mappings -; from TLI (if such mappings exist) for scalable and fixed width vectors. 
- -declare double @llvm.ceil.f64(double) -declare float @llvm.ceil.f32(float) - -define void @llvm_ceil_f64(double* %varray) { -; NEON-LABEL: define void @llvm_ceil_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_ceil_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1:[0-9]+]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.ceil.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_ceil_f32(float* %varray) { -; NEON-LABEL: define void @llvm_ceil_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_ceil_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.ceil.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.copysign.f64(double, double) -declare float @llvm.copysign.f32(float, float) - -define void @llvm_copysign_f64(double* %varray) { -; NEON-LABEL: define void @llvm_copysign_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_copysign_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.copysign.f64(double %conv, double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_copysign_f32(float* %varray) { -; NEON-LABEL: define void @llvm_copysign_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_copysign_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> 
[[TMP17:%.*]], <vscale x 4 x float> [[TMP17]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.copysign.f32(float %conv, float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.cos.f64(double) -declare float @llvm.cos.f32(float) - -define void @llvm_cos_f64(double* %varray) { -; NEON-LABEL: define void @llvm_cos_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_cos_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.cos.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_cos_f32(float* %varray) { -; NEON-LABEL: define void @llvm_cos_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_cos_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.cos.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.exp.f64(double) -declare float @llvm.exp.f32(float) - -define void @llvm_exp_f64(double* %varray) { -; NEON-LABEL: define void @llvm_exp_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_exp_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.exp.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 
%iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_exp_f32(float* %varray) { -; NEON-LABEL: define void @llvm_exp_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_exp_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.exp.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.exp2.f64(double) -declare float @llvm.exp2.f32(float) - -define void @llvm_exp2_f64(double* %varray) { -; NEON-LABEL: define void @llvm_exp2_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_exp2_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.exp2.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_exp2_f32(float* %varray) { -; NEON-LABEL: define void @llvm_exp2_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_exp2_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.exp2.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.exp10.f64(double) -declare float @llvm.exp10.f32(float) - -define void @llvm_exp10_f64(double* %varray) { -; NEON-LABEL: define void @llvm_exp10_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_exp10_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> 
@_ZGVsMxv_exp10(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.exp10.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_exp10_f32(float* %varray) { -; NEON-LABEL: define void @llvm_exp10_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_exp10_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.exp10.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.fabs.f64(double) -declare float @llvm.fabs.f32(float) - -define void @llvm_fabs_f64(double* %varray) { -; NEON-LABEL: define void @llvm_fabs_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_fabs_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.fabs.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - - -define void @llvm_fabs_f32(float* %varray) { -; NEON-LABEL: define void @llvm_fabs_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_fabs_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.fabs.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - 
-declare double @llvm.floor.f64(double) -declare float @llvm.floor.f32(float) - -define void @llvm_floor_f64(double* %varray) { -; NEON-LABEL: define void @llvm_floor_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_floor_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.floor.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_floor_f32(float* %varray) { -; NEON-LABEL: define void @llvm_floor_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_floor_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.floor.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.fma.f64(double, double, double) -declare float @llvm.fma.f32(float, float, float) - -define void @llvm_fma_f64(double* %varray) { -; NEON-LABEL: define void @llvm_fma_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]], <2 x double> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_fma_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]], <vscale x 2 x double> [[TMP17]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.fma.f64(double %conv, double %conv, double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_fma_f32(float* %varray) { -; NEON-LABEL: define void @llvm_fma_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]], <4 x float> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_fma_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 
x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x float> [[TMP17]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.fma.f32(float %conv, float %conv, float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.log.f64(double) -declare float @llvm.log.f32(float) - -define void @llvm_log_f64(double* %varray) { -; NEON-LABEL: define void @llvm_log_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_log_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.log.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_log_f32(float* %varray) { -; NEON-LABEL: define void @llvm_log_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_log_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.log.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.log10.f64(double) -declare float @llvm.log10.f32(float) - -define void @llvm_log10_f64(double* %varray) { -; NEON-LABEL: define void @llvm_log10_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_log10_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.log10.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store 
double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_log10_f32(float* %varray) { -; NEON-LABEL: define void @llvm_log10_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_log10_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.log10.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.log2.f64(double) -declare float @llvm.log2.f32(float) - -define void @llvm_log2_f64(double* %varray) { -; NEON-LABEL: define void @llvm_log2_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_log2_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.log2.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_log2_f32(float* %varray) { -; NEON-LABEL: define void @llvm_log2_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_log2_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.log2.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.maxnum.f64(double, double) -declare float @llvm.maxnum.f32(float, float) - -define void @llvm_maxnum_f64(double* %varray) { -; NEON-LABEL: define void @llvm_maxnum_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]]) -; 
NEON: ret void -; -; SVE-LABEL: define void @llvm_maxnum_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.maxnum.f64(double %conv, double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_maxnum_f32(float* %varray) { -; NEON-LABEL: define void @llvm_maxnum_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_maxnum_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.maxnum.f32(float %conv, float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.minnum.f64(double, double) -declare float @llvm.minnum.f32(float, float) - -define void @llvm_minnum_f64(double* %varray) { -; NEON-LABEL: define void @llvm_minnum_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_minnum_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.minnum.f64(double %conv, double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_minnum_f32(float* %varray) { -; NEON-LABEL: define void @llvm_minnum_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_minnum_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 
- %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.minnum.f32(float %conv, float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.nearbyint.f64(double) -declare float @llvm.nearbyint.f32(float) - -define void @llvm_nearbyint_f64(double* %varray) { -; NEON-LABEL: define void @llvm_nearbyint_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_nearbyint_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.nearbyint.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_nearbyint_f32(float* %varray) { -; NEON-LABEL: define void @llvm_nearbyint_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_nearbyint_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.nearbyint.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.pow.f64(double, double) -declare float @llvm.pow.f32(float, float) - -define void @llvm_pow_f64(double* %varray) { -; NEON-LABEL: define void @llvm_pow_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[TMP1:%.*]], <2 x double> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_pow_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x double> [[TMP17]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.pow.f64(double %conv, double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void 
@llvm_pow_f32(float* %varray) { -; NEON-LABEL: define void @llvm_pow_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[TMP1:%.*]], <4 x float> [[TMP1]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_pow_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x float> [[TMP17]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.pow.f32(float %conv, float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.rint.f64(double) -declare float @llvm.rint.f32(float) - -define void @llvm_rint_f64(double* %varray) { -; NEON-LABEL: define void @llvm_rint_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_rint_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.rint.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_rint_f32(float* %varray) { -; NEON-LABEL: define void @llvm_rint_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_rint_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.rint.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.round.f64(double) -declare float @llvm.round.f32(float) - -define void @llvm_round_f64(double* %varray) { -; NEON-LABEL: define void @llvm_round_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_round_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - 
for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.round.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_round_f32(float* %varray) { -; NEON-LABEL: define void @llvm_round_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_round_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.round.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.sin.f64(double) -declare float @llvm.sin.f32(float) - -define void @llvm_sin_f64(double* %varray) { -; NEON-LABEL: define void @llvm_sin_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_sin_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[TMP17:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.sin.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_sin_f32(float* %varray) { -; NEON-LABEL: define void @llvm_sin_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_sin_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[TMP17:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.sin.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.sqrt.f64(double) -declare float @llvm.sqrt.f32(float) - -define void @llvm_sqrt_f64(double* %varray) { -; 
NEON-LABEL: define void @llvm_sqrt_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_sqrt_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.sqrt.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_sqrt_f32(float* %varray) { -; NEON-LABEL: define void @llvm_sqrt_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_sqrt_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.sqrt.f32(float %conv) - %arrayidx = getelementptr inbounds float, float* %varray, i64 %iv - store float %call, float* %arrayidx, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -declare double @llvm.trunc.f64(double) -declare float @llvm.trunc.f32(float) - -define void @llvm_trunc_f64(double* %varray) { -; NEON-LABEL: define void @llvm_trunc_f64 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_trunc_f64 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to double - %call = tail call double @llvm.trunc.f64(double %conv) - %arrayidx = getelementptr inbounds double, double* %varray, i64 %iv - store double %call, double* %arrayidx, align 8 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, 1000 - br i1 %exitcond, label %for.end, label %for.body - - for.end: - ret void -} - -define void @llvm_trunc_f32(float* %varray) { -; NEON-LABEL: define void @llvm_trunc_f32 -; NEON-SAME: (ptr [[VARRAY:%.*]]) { -; NEON: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1:%.*]]) -; NEON: ret void -; -; SVE-LABEL: define void @llvm_trunc_f32 -; SVE-SAME: (ptr [[VARRAY:%.*]]) #[[ATTR1]] { -; SVE: [[TMP18:%.*]] = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[TMP17:%.*]]) -; SVE: ret void -; - entry: - br label %for.body - - for.body: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] - %tmp = trunc i64 %iv to i32 - %conv = sitofp i32 %tmp to float - %call = tail call float @llvm.trunc.f32(float %conv) - %arrayidx = getelementptr 
inbounds float, float* %varray, i64 %iv
- store float %call, float* %arrayidx, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond = icmp eq i64 %iv.next, 1000
- br i1 %exitcond, label %for.end, label %for.body
-
- for.end:
- ret void
-}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
new file mode 100644
index 0000000000000..ebee4fa42e9bf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
@@ -0,0 +1,2641 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(cos|sin|tan|cbrt|erf|exp|gamma|log|sqrt|copysign|dim|min|mod|hypot|nextafter|pow|fma)" --version 2
+; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SLEEF-NEON
+; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SLEEF-SVE
+; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=ARMPL-NEON
+; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=ARMPL-SVE
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; We are checking whether loops containing function calls can be vectorized,
+; when the compiler provides TLI mappings to their vector variants. The tests
+; are checking fixed width vectorization with NEON and scalable vectorization
+; with SVE.
+ +declare double @acos(double) +declare float @acosf(float) + +define void @acos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @acos_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_acos(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @acos_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_acos(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @acos_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vacosq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @acos_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svacos_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @acos(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @acos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @acos_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_acosf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @acos_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_acosf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @acos_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vacosq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @acos_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svacos_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @acosf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @acosh(double) +declare float @acoshf(float) + +define void @acosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void 
@acosh_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @acosh(double [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @acosh_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @acosh(double [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @acosh_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vacoshq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @acosh_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svacosh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @acosh(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @acosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @acosh_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @acoshf(float [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @acosh_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @acoshf(float [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @acosh_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vacoshq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @acosh_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svacosh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @acoshf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @asin(double) +declare float @asinf(float) + +define void @asin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @asin_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_asin(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @asin_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> 
@_ZGVsMxv_asin(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @asin_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vasinq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @asin_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svasin_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @asin(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @asin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @asin_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_asinf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @asin_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_asinf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @asin_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vasinq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @asin_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svasin_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @asinf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @asinh(double) +declare float @asinhf(float) + +define void @asinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @asinh_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @asinh(double [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @asinh_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @asinh(double [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @asinh_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> 
@armpl_vasinhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @asinh_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svasinh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @asinh(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @asinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @asinh_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @asinhf(float [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @asinh_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @asinhf(float [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @asinh_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vasinhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @asinh_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svasinh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @asinhf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @atan(double) +declare float @atanf(float) + +define void @atan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @atan_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_atan(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @atan_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atan(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @atan_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatanq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @atan_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svatan_f64_x(<vscale x 2 x double> 
[[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @atan(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @atan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @atan_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_atanf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @atan_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @atan_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatanq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @atan_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svatan_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @atanf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @atanh(double) +declare float @atanhf(float) + +define void @atanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @atanh_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_atanh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @atanh_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_atanh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @atanh_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatanhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @atanh_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svatanh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = 
getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @atanh(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @atanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @atanh_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_atanhf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @atanh_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_atanhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @atanh_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatanhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @atanh_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svatanh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @atanhf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @cbrt(double) +declare float @cbrtf(float) + +define void @cbrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @cbrt_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @cbrt(double [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @cbrt_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @cbrt(double [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @cbrt_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcbrtq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @cbrt_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svcbrt_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @cbrt(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = 
icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @cbrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @cbrt_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @cbrtf(float [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @cbrt_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @cbrtf(float [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @cbrt_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcbrtq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @cbrt_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svcbrt_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @cbrtf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @cos(double) +declare float @cosf(float) + +define void @cos_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @cos_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @cos_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @cos_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcosq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @cos_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @cos(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @cos_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @cos_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: 
[[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @cos_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @cos_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcosq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @cos_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @cosf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @cosh(double) +declare float @coshf(float) + +define void @cosh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @cosh_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_cosh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @cosh_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cosh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @cosh_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcoshq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @cosh_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svcosh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @cosh(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @cosh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @cosh_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_coshf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @cosh_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) 
#[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_coshf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @cosh_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcoshq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @cosh_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svcosh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @coshf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @erf(double) +declare float @erff(float) + +define void @erf_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @erf_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @erf(double [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @erf_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @erf(double [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @erf_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_verfq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @erf_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_sverf_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @erf(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @erf_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @erf_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @erff(float [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @erf_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @erff(float [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @erf_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_verfq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void 
@erf_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_sverf_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @erff(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @erfc(double) +declare float @erfcf(float) + +define void @erfc_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @erfc_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @erfc(double [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @erfc_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @erfc(double [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @erfc_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_verfcq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @erfc_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_sverfc_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @erfc(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @erfc_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @erfc_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @erfcf(float [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @erfc_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @erfcf(float [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @erfc_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_verfcq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @erfc_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_sverfc_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 
%iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @erfcf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @exp(double) +declare float @expf(float) + +define void @exp_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexpq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @exp(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @exp_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexpq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @expf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add 
nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @exp2(double) +declare float @exp2f(float) + +define void @exp2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp2_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp2_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp2_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp2q_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp2_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @exp2(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @exp2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp2_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp2_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp2_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp2q_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp2_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @exp2f(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @exp10(double) +declare float 
@exp10f(float) + +define void @exp10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp10_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp10_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp10_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp10q_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp10_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @exp10(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @exp10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp10_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp10_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp10_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp10q_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp10_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @exp10f(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @expm1(double) +declare float @expm1f(float) + +define void @expm1_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @expm1_f64 +; SLEEF-NEON-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @expm1(double [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @expm1_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @expm1(double [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @expm1_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexpm1q_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @expm1_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexpm1_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @expm1(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @expm1_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @expm1_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @expm1f(float [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @expm1_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @expm1f(float [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @expm1_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexpm1q_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @expm1_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexpm1_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @expm1f(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @lgamma(double) +declare float @lgammaf(float) + +define void @lgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @lgamma_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_lgamma(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @lgamma_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_lgamma(<vscale x 2 x double> 
[[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @lgamma_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlgammaq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @lgamma_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlgamma_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @lgamma(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @lgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @lgamma_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_lgammaf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @lgamma_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_lgammaf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @lgamma_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlgammaq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @lgamma_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlgamma_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @lgammaf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @log(double) +declare float @logf(float) + +define void @log_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @log_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr 
noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlogq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @log(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @log_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @log_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlogq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @logf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @log1p(double) +declare float @log1pf(float) + +define void @log1p_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @log1p_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @log1p(double [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log1p_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @log1p(double [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log1p_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog1pq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log1p_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale 
x 2 x double> @armpl_svlog1p_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @log1p(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @log1p_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @log1p_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @log1pf(float [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log1p_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @log1pf(float [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log1p_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog1pq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log1p_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog1p_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @log1pf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @log2(double) +declare float @log2f(float) + +define void @log2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @log2_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log2_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log2_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog2q_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log2_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in 
= load double, ptr %in.gep, align 8 + %call = tail call double @log2(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @log2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @log2_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log2_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log2_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog2q_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log2_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @log2f(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @log10(double) +declare float @log10f(float) + +define void @log10_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @log10_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log10_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log10_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog10q_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log10_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @log10(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, 
align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @log10_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @log10_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log10_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log10_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog10q_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log10_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @log10f(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @sin(double) +declare float @sinf(float) + +define void @sin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @sin_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sin_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @sin_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sin_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @sin(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @sin_f32(ptr 
noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @sin_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sin_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @sin_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sin_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @sinf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @sinh(double) +declare float @sinhf(float) + +define void @sinh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @sinh_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_sinh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sinh_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sinh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @sinh_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sinh_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svsinh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @sinh(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @sinh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @sinh_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] 
= call <4 x float> @_ZGVnN4v_sinhf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sinh_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @sinh_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sinh_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svsinh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @sinhf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @sinpi(double) +declare float @sinpif(float) + +define void @sinpi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @sinpi_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sinpi_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @sinpi(double [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void @sinpi_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinpiq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sinpi_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svsinpi_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @sinpi(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @sinpi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @sinpi_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sinpi_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @sinpif(float [[IN:%.*]]) +; +; ARMPL-NEON-LABEL: define void 
@sinpi_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinpiq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sinpi_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svsinpi_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @sinpif(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @sqrt(double) +declare float @sqrtf(float) + +define void @sqrt_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @sqrt_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_sqrt(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sqrt_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sqrt(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @sqrt_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsqrtq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sqrt_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svsqrt_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @sqrt(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @sqrt_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @sqrt_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_sqrtf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sqrt_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sqrtf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @sqrt_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsqrtq_f32(<4 x float> 
[[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sqrt_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svsqrt_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @sqrtf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @tan(double) +declare float @tanf(float) + +define void @tan_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @tan_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_tan(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @tan_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tan(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @tan_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtanq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @tan_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svtan_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @tan(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @tan_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @tan_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_tanf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @tan_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @tan_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtanq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @tan_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> 
@armpl_svtan_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @tanf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @tanh(double) +declare float @tanhf(float) + +define void @tanh_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @tanh_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_tanh(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @tanh_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tanh(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @tanh_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtanhq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @tanh_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svtanh_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @tanh(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @tanh_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @tanh_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_tanhf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @tanh_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tanhf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @tanh_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtanhq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @tanh_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svtanh_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ 
%iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @tanhf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @tgamma(double) +declare float @tgammaf(float) + +define void @tgamma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @tgamma_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_tgamma(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @tgamma_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_tgamma(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @tgamma_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vtgammaq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @tgamma_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svtgamma_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @tgamma(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @tgamma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @tgamma_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_tgammaf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @tgamma_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_tgammaf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @tgamma_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vtgammaq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @tgamma_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svtgamma_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float 
@tgammaf(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @atan2(double, double) +declare float @atan2f(float, float) + +define void @atan2_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @atan2_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2vv_atan2(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @atan2_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_atan2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @atan2_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vatan2q_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @atan2_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svatan2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @atan2(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @atan2_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @atan2_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4vv_atan2f(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @atan2_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_atan2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @atan2_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vatan2q_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @atan2_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svatan2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ 
%iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @atan2f(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @copysign(double, double) +declare float @copysignf(float, float) + +define void @copysign_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @copysign_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]]) +; +; SLEEF-SVE-LABEL: define void @copysign_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @copysign(double [[IN:%.*]], double [[IN]]) +; +; ARMPL-NEON-LABEL: define void @copysign_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcopysignq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @copysign_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svcopysign_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @copysign(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @copysign_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @copysign_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]]) +; +; SLEEF-SVE-LABEL: define void @copysign_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @copysignf(float [[IN:%.*]], float [[IN]]) +; +; ARMPL-NEON-LABEL: define void @copysign_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcopysignq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @copysign_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svcopysign_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load 
float, ptr %in.gep, align 8 + %call = tail call float @copysignf(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @fdim(double, double) +declare float @fdimf(float, float) + +define void @fdim_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @fdim_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]]) +; +; SLEEF-SVE-LABEL: define void @fdim_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @fdim(double [[IN:%.*]], double [[IN]]) +; +; ARMPL-NEON-LABEL: define void @fdim_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfdimq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @fdim_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svfdim_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @fdim(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @fdim_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @fdim_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]]) +; +; SLEEF-SVE-LABEL: define void @fdim_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @fdimf(float [[IN:%.*]], float [[IN]]) +; +; ARMPL-NEON-LABEL: define void @fdim_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfdimq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @fdim_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svfdim_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @fdimf(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, 
align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @fmin(double, double) +declare float @fminf(float, float) + +define void @fmin_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @fmin_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]]) +; +; SLEEF-SVE-LABEL: define void @fmin_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @fmin(double [[IN:%.*]], double [[IN]]) +; +; ARMPL-NEON-LABEL: define void @fmin_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfminq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @fmin_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svfmin_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @fmin(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @fmin_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @fmin_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]]) +; +; SLEEF-SVE-LABEL: define void @fmin_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @fminf(float [[IN:%.*]], float [[IN]]) +; +; ARMPL-NEON-LABEL: define void @fmin_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfminq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @fmin_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svfmin_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @fminf(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double 
@fmod(double, double) +declare float @fmodf(float, float) + +define void @fmod_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @fmod_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2vv_fmod(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @fmod_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_fmod(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @fmod_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfmodq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @fmod_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svfmod_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @fmod(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @fmod_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @fmod_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4vv_fmodf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @fmod_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_fmodf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @fmod_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfmodq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @fmod_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svfmod_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @fmodf(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + 
%exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @hypot(double, double) +declare float @hypotf(float, float) + +define void @hypot_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @hypot_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @hypot(double [[IN:%.*]], double [[IN]]) +; +; SLEEF-SVE-LABEL: define void @hypot_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @hypot(double [[IN:%.*]], double [[IN]]) +; +; ARMPL-NEON-LABEL: define void @hypot_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vhypotq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @hypot_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svhypot_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @hypot(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @hypot_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @hypot_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @hypotf(float [[IN:%.*]], float [[IN]]) +; +; SLEEF-SVE-LABEL: define void @hypot_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @hypotf(float [[IN:%.*]], float [[IN]]) +; +; ARMPL-NEON-LABEL: define void @hypot_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vhypotq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @hypot_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svhypot_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @hypotf(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @nextafter(double, double) 
+declare float @nextafterf(float, float) + +define void @nextafter_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @nextafter_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]]) +; +; SLEEF-SVE-LABEL: define void @nextafter_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @nextafter(double [[IN:%.*]], double [[IN]]) +; +; ARMPL-NEON-LABEL: define void @nextafter_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vnextafterq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @nextafter_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svnextafter_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @nextafter(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @nextafter_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @nextafter_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]]) +; +; SLEEF-SVE-LABEL: define void @nextafter_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @nextafterf(float [[IN:%.*]], float [[IN]]) +; +; ARMPL-NEON-LABEL: define void @nextafter_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vnextafterq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @nextafter_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svnextafter_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @nextafterf(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @pow(double, double) +declare float @powf(float, float) + +define void @pow_f64(ptr noalias %in.ptr, ptr 
noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @pow_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @pow_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @pow_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vpowq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @pow_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svpow_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @pow(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @pow_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @pow_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @pow_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @pow_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vpowq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @pow_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svpow_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @powf(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare 
double @fma(double, double, double) +declare float @fmaf(float, float, float) + +define void @fma_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @fma_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]]) +; +; SLEEF-SVE-LABEL: define void @fma_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call double @fma(double [[IN:%.*]], double [[IN]], double [[IN]]) +; +; ARMPL-NEON-LABEL: define void @fma_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfmaq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @fma_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svfma_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @fma(double %in, double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @fma_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { +; SLEEF-NEON-LABEL: define void @fma_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-NEON: [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]]) +; +; SLEEF-SVE-LABEL: define void @fma_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; SLEEF-SVE: [[CALL:%.*]] = tail call float @fmaf(float [[IN:%.*]], float [[IN]], float [[IN]]) +; +; ARMPL-NEON-LABEL: define void @fma_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfmaq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @fma_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svfma_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @fmaf(float %in, float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + 
for.end: + ret void +} + diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll new file mode 100644 index 0000000000000..d59c28849bfb5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-intrinsic-calls.ll @@ -0,0 +1,1547 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(cos|exp|log|sin|pow|ceil|copysign|fabs|floor|fma|m..num|nearbyint|rint|round|sqrt|trunc)" --version 2 + +; RUN: opt -mattr=+neon -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=SLEEF-NEON +; RUN: opt -mattr=+sve -vector-library=sleefgnuabi -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=SLEEF-SVE +; RUN: opt -mattr=+neon -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -S < %s | FileCheck %s --check-prefix=ARMPL-NEON +; RUN: opt -mattr=+sve -vector-library=ArmPL -passes=inject-tli-mappings,loop-vectorize,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s --check-prefix=ARMPL-SVE + +target triple = "aarch64-unknown-linux-gnu" + +; We are checking whether loops containing intrinsic calls can be vectorized, +; when the compiler provides TLI mappings to their vector variants. The tests +; are checking fixed width vectorization with NEON and scalable vectorization +; with SVE. + +declare double @llvm.cos.f64(double) +declare float @llvm.cos.f32(float) + +define void @cos_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @cos_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_cos(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @cos_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_cos(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @cos_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcosq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @cos_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1:[0-9]+]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svcos_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.cos.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @cos_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @cos_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr 
[[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_cosf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @cos_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_cosf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @cos_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcosq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @cos_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svcos_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.cos.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.exp.f64(double) +declare float @llvm.exp.f32(float) + +define void @exp_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexpq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.exp.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @exp_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_expf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: 
[[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_expf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexpq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.exp.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.exp2.f64(double) +declare float @llvm.exp2.f32(float) + +define void @exp2_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp2_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp2_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp2_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp2q_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp2_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.exp2.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @exp2_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp2_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_exp2f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp2_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp2_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], 
ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp2q_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp2_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.exp2.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.exp10.f64(double) +declare float @llvm.exp10.f32(float) + +define void @exp10_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp10_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_exp10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp10_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_exp10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp10_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vexp10q_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp10_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svexp10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.exp10.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @exp10_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @exp10_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_exp10f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @exp10_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_exp10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @exp10_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vexp10q_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @exp10_f32 +; ARMPL-SVE-SAME: (ptr noalias 
[[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svexp10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.exp10.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.log.f64(double) +declare float @llvm.log.f32(float) + +define void @log_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @log_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlogq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.log.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @log_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @log_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_logf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_logf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlogq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + 
for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.log.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.log2.f64(double) +declare float @llvm.log2.f32(float) + +define void @log2_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @log2_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log2(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log2_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log2(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log2_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog2q_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log2_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog2_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.log2.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @log2_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @log2_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_log2f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log2_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log2f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log2_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog2q_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log2_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog2_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.log2.f32(float %in) + %out.gep = 
getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.log10.f64(double) +declare float @llvm.log10.f32(float) + +define void @log10_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @log10_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_log10(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log10_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_log10(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log10_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vlog10q_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log10_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svlog10_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.log10.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @log10_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @log10_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_log10f(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @log10_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_log10f(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @log10_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vlog10q_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @log10_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svlog10_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.log10.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label 
%for.body + + for.end: + ret void +} + +declare double @llvm.sin.f64(double) +declare float @llvm.sin.f32(float) + +define void @sin_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @sin_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2v_sin(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sin_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxv_sin(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @sin_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vsinq_f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sin_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svsin_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.sin.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @sin_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @sin_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4v_sinf(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sin_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxv_sinf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @sin_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vsinq_f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sin_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svsin_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.sin.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.pow.f64(double, double) +declare float @llvm.pow.f32(float, float) + +define void @pow_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @pow_f64 +; SLEEF-NEON-SAME: (ptr 
noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @_ZGVnN2vv_pow(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @pow_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @_ZGVsMxvv_pow(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @pow_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vpowq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @pow_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @armpl_svpow_f64_x(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.pow.f64(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @pow_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @pow_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @_ZGVnN4vv_powf(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @pow_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @_ZGVsMxvv_powf(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; +; ARMPL-NEON-LABEL: define void @pow_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vpowq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @pow_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @armpl_svpow_f32_x(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.pow.f32(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.ceil.f64(double) +declare float @llvm.ceil.f32(float) + +define void @ceil_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; 
SLEEF-NEON-LABEL: define void @ceil_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @ceil_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @ceil_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @ceil_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.ceil.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.ceil.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @ceil_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @ceil_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @ceil_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @ceil_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @ceil_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.ceil.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.ceil.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.copysign.f64(double, double) +declare float @llvm.copysign.f32(float, float) + +define void @copysign_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @copysign_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @copysign_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) 
#[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]]) +; +; ARMPL-NEON-LABEL: define void @copysign_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @copysign_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.copysign.f64(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @copysign_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @copysign_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @copysign_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]]) +; +; ARMPL-NEON-LABEL: define void @copysign_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @copysign_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.copysign.f32(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.fabs.f64(double) +declare float @llvm.fabs.f32(float) + +define void @fabs_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @fabs_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @fabs_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x 
double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @fabs_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @fabs_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.fabs.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.fabs.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @fabs_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @fabs_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @fabs_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @fabs_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @fabs_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.fabs.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.floor.f64(double) +declare float @llvm.floor.f32(float) + +define void @floor_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @floor_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @floor_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @floor_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @floor_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr 
[[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.floor.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.floor.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @floor_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @floor_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @floor_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @floor_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @floor_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.floor.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.floor.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.fma.f64(double, double, double) +declare float @llvm.fma.f32(float, float, float) + +define void @fma_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @fma_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @fma_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]]) +; +; ARMPL-NEON-LABEL: define void @fma_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @fma_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> 
[[WIDE_MASKED_LOAD]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.fma.f64(double %in, double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @fma_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @fma_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @fma_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]]) +; +; ARMPL-NEON-LABEL: define void @fma_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @fma_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.fma.f32(float %in, float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.maxnum.f64(double, double) +declare float @llvm.maxnum.f32(float, float) + +define void @maxnum_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @maxnum_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @maxnum_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]]) +; +; ARMPL-NEON-LABEL: define void @maxnum_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @maxnum_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: 
[[TMP15:%.*]] = call <vscale x 2 x double> @llvm.maxnum.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.maxnum.f64(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @maxnum_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @maxnum_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @maxnum_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]]) +; +; ARMPL-NEON-LABEL: define void @maxnum_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @maxnum_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.maxnum.f32(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.minnum.f64(double, double) +declare float @llvm.minnum.f32(float, float) + +define void @minnum_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @minnum_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @minnum_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.minnum.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]]) +; +; ARMPL-NEON-LABEL: define void @minnum_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @minnum_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> 
@llvm.minnum.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]], <vscale x 2 x double> [[WIDE_MASKED_LOAD]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.minnum.f64(double %in, double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @minnum_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @minnum_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; SLEEF-SVE-LABEL: define void @minnum_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]]) +; +; ARMPL-NEON-LABEL: define void @minnum_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) +; +; ARMPL-SVE-LABEL: define void @minnum_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.minnum.f32(float %in, float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.nearbyint.f64(double) +declare float @llvm.nearbyint.f32(float) + +define void @nearbyint_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @nearbyint_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @nearbyint_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @nearbyint_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @nearbyint_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.nearbyint.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = 
phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.nearbyint.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @nearbyint_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @nearbyint_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @nearbyint_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @nearbyint_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @nearbyint_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.nearbyint.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.nearbyint.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.rint.f64(double) +declare float @llvm.rint.f32(float) + +define void @rint_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @rint_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @rint_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @rint_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @rint_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.rint.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.rint.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + 
%exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @rint_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @rint_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @rint_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @rint_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @rint_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.rint.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.rint.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.round.f64(double) +declare float @llvm.round.f32(float) + +define void @round_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @round_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @round_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @round_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @round_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.round.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.round.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @round_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @round_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> 
[[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @round_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @round_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @round_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.round.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.round.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.sqrt.f64(double) +declare float @llvm.sqrt.f32(float) + +define void @sqrt_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @sqrt_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sqrt_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @sqrt_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sqrt_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.sqrt.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @sqrt_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @sqrt_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @sqrt_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @sqrt_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; 
ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @sqrt_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.sqrt.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +declare double @llvm.trunc.f64(double) +declare float @llvm.trunc.f32(float) + +define void @trunc_f64(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @trunc_f64 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @trunc_f64 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @trunc_f64 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @trunc_f64 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds double, ptr %in.ptr, i64 %iv + %in = load double, ptr %in.gep, align 8 + %call = tail call double @llvm.trunc.f64(double %in) + %out.gep = getelementptr inbounds double, ptr %out.ptr, i64 %iv + store double %call, ptr %out.gep, align 8 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + +define void @trunc_f32(ptr noalias %in.ptr, ptr %out.ptr) { +; SLEEF-NEON-LABEL: define void @trunc_f32 +; SLEEF-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; SLEEF-SVE-LABEL: define void @trunc_f32 +; SLEEF-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; SLEEF-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; +; ARMPL-NEON-LABEL: define void @trunc_f32 +; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[WIDE_LOAD:%.*]]) +; +; ARMPL-SVE-LABEL: define void @trunc_f32 +; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr [[OUT_PTR:%.*]]) #[[ATTR1]] { +; ARMPL-SVE: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float> [[WIDE_MASKED_LOAD:%.*]]) +; + entry: + br label %for.body + + for.body: + 
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %in.gep = getelementptr inbounds float, ptr %in.ptr, i64 %iv + %in = load float, ptr %in.gep, align 8 + %call = tail call float @llvm.trunc.f32(float %in) + %out.gep = getelementptr inbounds float, ptr %out.ptr, i64 %iv + store float %call, ptr %out.gep, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + + for.end: + ret void +} + From 650119c737c12860af7b7e661f5c3c0dc1400010 Mon Sep 17 00:00:00 2001 From: Nico Weber <thakis@chromium.org> Date: Fri, 22 Dec 2023 11:31:14 -0500 Subject: [PATCH 189/342] [gn] port d430c145ba92 (dladdr check for clang) --- .../gn/secondary/clang/include/clang/Config/BUILD.gn | 12 ++++++++++-- .../gn/secondary/llvm/include/llvm/Config/BUILD.gn | 8 ++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn index 808010deef046..7273803dd5165 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn @@ -44,9 +44,17 @@ write_cmake_config("Config") { } if (host_os != "win") { - values += [ "CLANG_HAVE_RLIMITS=1" ] + values += [ + "CLANG_HAVE_DLADDR=1", + "CLANG_HAVE_DLFCN_H=1", + "CLANG_HAVE_RLIMITS=1", + ] } else { - values += [ "CLANG_HAVE_RLIMITS=" ] + values += [ + "CLANG_HAVE_DLADDR=", + "CLANG_HAVE_DLFCN_H=", + "CLANG_HAVE_RLIMITS=", + ] } if (llvm_enable_libxml2) { diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index 2478b2f8a861c..e5fb529b455fc 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -205,6 +205,8 @@ write_cmake_config("config") { if (current_os == "win") { values += [ "HAVE_DECL_STRERROR_S=1", + "HAVE_DLADDR=", + "HAVE_DLFCN_H=", "HAVE_DLOPEN=", "HAVE_FUTIMES=", "HAVE_GETPAGESIZE=", @@ -239,6 +241,8 @@ write_cmake_config("config") { # POSIX-y system defaults. 
values += [ "HAVE_DECL_STRERROR_S=", + "HAVE_DLADDR=1", + "HAVE_DLFCN_H=1", "HAVE_DLOPEN=1", "HAVE_FUTIMES=1", "HAVE_GETPAGESIZE=1", @@ -358,16 +362,12 @@ write_cmake_config("llvm-config") { if (current_os == "win") { values += [ - "HAVE_DLADDR=", - "HAVE_DLFCN_H=", "HAVE_SYSEXITS_H=", "LLVM_ENABLE_PLUGINS=", "LLVM_ON_UNIX=", ] } else { values += [ - "HAVE_DLADDR=1", - "HAVE_DLFCN_H=1", "HAVE_SYSEXITS_H=1", "LLVM_ENABLE_PLUGINS=1", "LLVM_ON_UNIX=1", From 366923810ef11b2df1c4cffa29b51c304c72d332 Mon Sep 17 00:00:00 2001 From: Maksim Levental <maksim.levental@gmail.com> Date: Fri, 22 Dec 2023 11:05:46 -0600 Subject: [PATCH 190/342] [lit] add `LIT_MAX_WORKERS` env variable (#76205) --- llvm/utils/lit/lit/cl_arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/lit/lit/cl_arguments.py b/llvm/utils/lit/lit/cl_arguments.py index ba3706659550b..b9122d07afd8a 100644 --- a/llvm/utils/lit/lit/cl_arguments.py +++ b/llvm/utils/lit/lit/cl_arguments.py @@ -36,7 +36,7 @@ def parse_args(): metavar="N", help="Number of workers used for testing", type=_positive_int, - default=lit.util.usable_core_count(), + default=os.getenv("LIT_MAX_WORKERS", lit.util.usable_core_count()), ) parser.add_argument( "--config-prefix", From d782f198a61221c8b80734c371b304981eae5576 Mon Sep 17 00:00:00 2001 From: Nico Weber <thakis@chromium.org> Date: Fri, 22 Dec 2023 12:06:32 -0500 Subject: [PATCH 191/342] lld/MachO: Fix two typos to cycle bots --- lld/MachO/InputFiles.h | 2 +- lld/MachO/Options.td | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lld/MachO/InputFiles.h b/lld/MachO/InputFiles.h index 2e37e7ba5a006..5e550c167c232 100644 --- a/lld/MachO/InputFiles.h +++ b/lld/MachO/InputFiles.h @@ -140,7 +140,7 @@ class InputFile { InputFile(Kind, const llvm::MachO::InterfaceFile &); - // If true, this input's arch is compatiable with target. + // If true, this input's arch is compatible with target. bool compatArch = true; private: diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td index f92e6cda31e52..01e73b789f9aa 100644 --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -132,7 +132,7 @@ def check_category_conflicts : Flag<["--"], "check-category-conflicts">, def lto_debug_pass_manager: Flag<["--"], "lto-debug-pass-manager">, HelpText<"Debug new pass manager">, Group<grp_lld>; def cs_profile_generate: Flag<["--"], "cs-profile-generate">, - HelpText<"Perform context senstive PGO instrumentation">, Group<grp_lld>; + HelpText<"Perform context sensitive PGO instrumentation">, Group<grp_lld>; def cs_profile_path: Joined<["--"], "cs-profile-path=">, HelpText<"Context sensitive profile file path">, Group<grp_lld>; defm pgo_warn_mismatch: BB<"pgo-warn-mismatch", From 04c473bea3e0f135432698fcaafab52e1fe1b5ec Mon Sep 17 00:00:00 2001 From: Nico Weber <thakis@chromium.org> Date: Fri, 22 Dec 2023 12:11:05 -0500 Subject: [PATCH 192/342] [lldb] Fix two deprecation warnings No behavior change. 
--- lldb/source/Host/linux/HostInfoLinux.cpp | 2 +- lldb/source/Target/JITLoaderList.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/source/Host/linux/HostInfoLinux.cpp b/lldb/source/Host/linux/HostInfoLinux.cpp index c66f787db0cf9..723f0c2fb3fdc 100644 --- a/lldb/source/Host/linux/HostInfoLinux.cpp +++ b/lldb/source/Host/linux/HostInfoLinux.cpp @@ -123,7 +123,7 @@ llvm::StringRef HostInfoLinux::GetDistributionId() { if (strstr(distribution_id, distributor_id_key)) { // strip newlines std::string id_string(distribution_id + strlen(distributor_id_key)); - llvm::erase_value(id_string, '\n'); + llvm::erase(id_string, '\n'); // lower case it and convert whitespace to underscores std::transform( diff --git a/lldb/source/Target/JITLoaderList.cpp b/lldb/source/Target/JITLoaderList.cpp index 9158d0a5e546c..9fa070edd4b8d 100644 --- a/lldb/source/Target/JITLoaderList.cpp +++ b/lldb/source/Target/JITLoaderList.cpp @@ -24,7 +24,7 @@ void JITLoaderList::Append(const JITLoaderSP &jit_loader_sp) { void JITLoaderList::Remove(const JITLoaderSP &jit_loader_sp) { std::lock_guard<std::recursive_mutex> guard(m_jit_loaders_mutex); - llvm::erase_value(m_jit_loaders_vec, jit_loader_sp); + llvm::erase(m_jit_loaders_vec, jit_loader_sp); } size_t JITLoaderList::GetSize() const { return m_jit_loaders_vec.size(); } From 52b7045fbb70571e09c0ad3be7bd3f0c1acccffa Mon Sep 17 00:00:00 2001 From: Kareem Ergawy <kareem.ergawy@amd.com> Date: Fri, 22 Dec 2023 20:02:31 +0100 Subject: [PATCH 193/342] [flang][MLIR][OpenMP] Emit `UpdateDataOp` from `!$omp target update` (#75345) Emits MLIR op corresponding to `!$omp target update` directive. So far, only motion types: `to` and `from` are supported. Motion modifiers: `present`, `mapper`, and `iterator` are not supported yet. This is a follow up to #75047 & #75159, only the last commit is relevant to this PR. --- flang/lib/Lower/OpenMP.cpp | 85 ++++++++++++++++++++--- flang/test/Lower/OpenMP/FIR/target.f90 | 94 ++++++++++++++++++++++++++ flang/test/Lower/OpenMP/target.f90 | 88 ++++++++++++++++++++++++ 3 files changed, 256 insertions(+), 11 deletions(-) diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index 8ed31766725e1..c3a570bf15ea0 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -607,6 +607,12 @@ class ClauseProcessor { llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &useDeviceSymbols) const; + template <typename T> + bool + processMotionClauses(Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::StatementContext &stmtCtx, + llvm::SmallVectorImpl<mlir::Value> &mapOperands); + // Call this method for these clauses that should be supported but are not // implemented yet. It triggers a compilation error if any of the given // clauses is found. @@ -1893,6 +1899,47 @@ bool ClauseProcessor::processUseDevicePtr( }); } +template <typename T> +bool ClauseProcessor::processMotionClauses( + Fortran::semantics::SemanticsContext &semanticsContext, + Fortran::lower::StatementContext &stmtCtx, + llvm::SmallVectorImpl<mlir::Value> &mapOperands) { + return findRepeatableClause<T>( + [&](const T *motionClause, const Fortran::parser::CharBlock &source) { + mlir::Location clauseLocation = converter.genLocation(source); + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + static_assert(std::is_same_v<T, ClauseProcessor::ClauseTy::To> || + std::is_same_v<T, ClauseProcessor::ClauseTy::From>); + + // TODO Support motion modifiers: present, mapper, iterator. 
+ constexpr llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = + std::is_same_v<T, ClauseProcessor::ClauseTy::To> + ? llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO + : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + + for (const Fortran::parser::OmpObject &ompObject : motionClause->v.v) { + llvm::SmallVector<mlir::Value> bounds; + std::stringstream asFortran; + Fortran::lower::AddrAndBoundsInfo info = + Fortran::lower::gatherDataOperandAddrAndBounds< + Fortran::parser::OmpObject, mlir::omp::DataBoundsOp, + mlir::omp::DataBoundsType>( + converter, firOpBuilder, semanticsContext, stmtCtx, ompObject, + clauseLocation, asFortran, bounds, treatIndexAsSection); + + mlir::Value mapOp = createMapInfoOp( + firOpBuilder, clauseLocation, info.addr, asFortran, bounds, + static_cast< + std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>( + mapTypeBits), + mlir::omp::VariableCaptureKind::ByRef, info.addr.getType()); + + mapOperands.push_back(mapOp); + } + }); +} + template <typename... Ts> void ClauseProcessor::processTODO(mlir::Location currentLocation, llvm::omp::Directive directive) const { @@ -2416,10 +2463,10 @@ genDataOp(Fortran::lower::AbstractConverter &converter, template <typename OpTy> static OpTy -genEnterExitDataOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semanticsContext, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { +genEnterExitUpdateDataOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semanticsContext, + mlir::Location currentLocation, + const Fortran::parser::OmpClauseList &clauseList) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); Fortran::lower::StatementContext stmtCtx; mlir::Value ifClauseOperand, deviceOperand; @@ -2436,6 +2483,10 @@ genEnterExitDataOp(Fortran::lower::AbstractConverter &converter, directiveName = Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetExitData; directive = llvm::omp::Directive::OMPD_target_exit_data; + } else if constexpr (std::is_same_v<OpTy, mlir::omp::UpdateDataOp>) { + directiveName = + Fortran::parser::OmpIfClause::DirectiveNameModifier::TargetUpdate; + directive = llvm::omp::Directive::OMPD_target_update; } else { return nullptr; } @@ -2444,8 +2495,18 @@ genEnterExitDataOp(Fortran::lower::AbstractConverter &converter, cp.processIf(directiveName, ifClauseOperand); cp.processDevice(stmtCtx, deviceOperand); cp.processNowait(nowaitAttr); - cp.processMap(currentLocation, directive, semanticsContext, stmtCtx, - mapOperands); + + if constexpr (std::is_same_v<OpTy, mlir::omp::UpdateDataOp>) { + cp.processMotionClauses<Fortran::parser::OmpClause::To>( + semanticsContext, stmtCtx, mapOperands); + cp.processMotionClauses<Fortran::parser::OmpClause::From>( + semanticsContext, stmtCtx, mapOperands); + + } else { + cp.processMap(currentLocation, directive, semanticsContext, stmtCtx, + mapOperands); + } + cp.processTODO<Fortran::parser::OmpClause::Depend>(currentLocation, directive); @@ -2847,15 +2908,17 @@ genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter, genDataOp(converter, eval, semanticsContext, currentLocation, opClauseList); break; case llvm::omp::Directive::OMPD_target_enter_data: - genEnterExitDataOp<mlir::omp::EnterDataOp>(converter, semanticsContext, - currentLocation, opClauseList); + genEnterExitUpdateDataOp<mlir::omp::EnterDataOp>( + converter, semanticsContext, currentLocation, opClauseList); break; case llvm::omp::Directive::OMPD_target_exit_data: - 
genEnterExitDataOp<mlir::omp::ExitDataOp>(converter, semanticsContext, - currentLocation, opClauseList); + genEnterExitUpdateDataOp<mlir::omp::ExitDataOp>( + converter, semanticsContext, currentLocation, opClauseList); break; case llvm::omp::Directive::OMPD_target_update: - TODO(currentLocation, "OMPD_target_update"); + genEnterExitUpdateDataOp<mlir::omp::UpdateDataOp>( + converter, semanticsContext, currentLocation, opClauseList); + break; case llvm::omp::Directive::OMPD_ordered: TODO(currentLocation, "OMPD_ordered"); } diff --git a/flang/test/Lower/OpenMP/FIR/target.f90 b/flang/test/Lower/OpenMP/FIR/target.f90 index 2034ac84334e5..5d36699bf0e90 100644 --- a/flang/test/Lower/OpenMP/FIR/target.f90 +++ b/flang/test/Lower/OpenMP/FIR/target.f90 @@ -133,6 +133,100 @@ subroutine omp_target_exit_device !$omp target exit data map(from: a) device(d) end subroutine omp_target_exit_device +!=============================================================================== +! Target_Update `to` clause +!=============================================================================== + +subroutine omp_target_update_to + integer :: a(1024) + + !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_update_toEa"} + !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds + + !CHECK: %[[TO_MAP:.*]] = omp.map_info var_ptr(%[[A_ALLOC]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>) + !CHECK-SAME: map_clauses(to) capture(ByRef) + !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"} + + !CHECK: omp.target_update_data + !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>) + !$omp target update to(a) +end subroutine omp_target_update_to + +!=============================================================================== +! Target_Update `from` clause +!=============================================================================== + +subroutine omp_target_update_from + integer :: a(1024) + + !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca !fir.array<1024xi32> {bindc_name = "a", uniq_name = "_QFomp_target_update_fromEa"} + !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds + + !CHECK: %[[FROM_MAP:.*]] = omp.map_info var_ptr(%[[A_ALLOC]] : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>) + !CHECK-SAME: map_clauses(from) capture(ByRef) + !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"} + + !CHECK: omp.target_update_data + !CHECK-SAME: motion_entries(%[[FROM_MAP]] : !fir.ref<!fir.array<1024xi32>>) + !$omp target update from(a) +end subroutine omp_target_update_from + +!=============================================================================== +! Target_Update `if` clause +!=============================================================================== + +subroutine omp_target_update_if + integer :: a(1024) + logical :: i + + !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca + !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds + !CHECK-DAG: %[[COND:.*]] = fir.convert %{{.*}} : (!fir.logical<4>) -> i1 + + !CHECK: %[[TO_MAP:.*]] = omp.map_info + + !CHECK: omp.target_update_data if(%[[COND]] : i1) + !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>) + !$omp target update to(a) if(i) +end subroutine omp_target_update_if + +!=============================================================================== +! 
Target_Update `device` clause +!=============================================================================== + +subroutine omp_target_update_device + integer :: a(1024) + + !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca + !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds + !CHECK-DAG: %[[DEVICE:.*]] = arith.constant 1 : i32 + + !CHECK: %[[TO_MAP:.*]] = omp.map_info + + !CHECK: omp.target_update_data + !CHECK-SAME: device(%[[DEVICE]] : i32) + !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>) + !$omp target update to(a) device(1) +end subroutine omp_target_update_device + +!=============================================================================== +! Target_Update `nowait` clause +!=============================================================================== + +subroutine omp_target_update_nowait + integer :: a(1024) + + !CHECK-DAG: %[[A_ALLOC:.*]] = fir.alloca + !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds + + !CHECK: %[[TO_MAP:.*]] = omp.map_info + + !CHECK: omp.target_update_data + !CHECK-SAME: nowait + !CHECK-SAME: motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>) + !$omp target update to(a) nowait +end subroutine omp_target_update_nowait + !=============================================================================== ! Target_Data with region !=============================================================================== diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 26b4d595d5229..5ca3a08d8a8b2 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -134,6 +134,94 @@ subroutine omp_target_exit_device !$omp target exit data map(from: a) device(d) end subroutine omp_target_exit_device +!=============================================================================== +! Target_Update `to` clause +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_update_to() { +subroutine omp_target_update_to + integer :: a(1024) + + !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) + !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds + + !CHECK: %[[TO_MAP:.*]] = omp.map_info var_ptr(%[[A_DECL]]#1 : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>) + !CHECK-SAME: map_clauses(to) capture(ByRef) + !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"} + + !CHECK: omp.target_update_data motion_entries(%[[TO_MAP]] : !fir.ref<!fir.array<1024xi32>>) + !$omp target update to(a) +end subroutine omp_target_update_to + +!=============================================================================== +! Target_Update `from` clause +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_update_from() { +subroutine omp_target_update_from + integer :: a(1024) + + !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) + !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds + + !CHECK: %[[FROM_MAP:.*]] = omp.map_info var_ptr(%[[A_DECL]]#1 : !fir.ref<!fir.array<1024xi32>>, !fir.array<1024xi32>) + !CHECK-SAME: map_clauses(from) capture(ByRef) + !CHECK-SAME: bounds(%[[BOUNDS]]) -> !fir.ref<!fir.array<1024xi32>> {name = "a"} + + !CHECK: omp.target_update_data motion_entries(%[[FROM_MAP]] : !fir.ref<!fir.array<1024xi32>>) + !$omp target update from(a) +end subroutine omp_target_update_from + +!=============================================================================== +! 
Target_Update `if` clause +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_update_if() { +subroutine omp_target_update_if + integer :: a(1024) + logical :: i + + !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) + !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds + !CHECK-DAG: %[[COND:.*]] = fir.convert %{{.*}} : (!fir.logical<4>) -> i1 + + !CHECK: omp.target_update_data if(%[[COND]] : i1) motion_entries + !$omp target update from(a) if(i) +end subroutine omp_target_update_if + +!=============================================================================== +! Target_Update `device` clause +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_update_device() { +subroutine omp_target_update_device + integer :: a(1024) + logical :: i + + !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) + !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds + !CHECK-DAG: %[[DEVICE:.*]] = arith.constant 1 : i32 + + !CHECK: omp.target_update_data device(%[[DEVICE]] : i32) motion_entries + !$omp target update from(a) device(1) +end subroutine omp_target_update_device + +!=============================================================================== +! Target_Update `nowait` clause +!=============================================================================== + +!CHECK-LABEL: func.func @_QPomp_target_update_nowait() { +subroutine omp_target_update_nowait + integer :: a(1024) + logical :: i + + !CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) + !CHECK-DAG: %[[BOUNDS:.*]] = omp.bounds + + !CHECK: omp.target_update_data nowait motion_entries + !$omp target update from(a) nowait +end subroutine omp_target_update_nowait + !=============================================================================== ! Target_Data with region !=============================================================================== From e9a56ab31613b610d53b9ca86918168c49842c59 Mon Sep 17 00:00:00 2001 From: Florian Hahn <flo@fhahn.com> Date: Fri, 22 Dec 2023 19:44:19 +0000 Subject: [PATCH 194/342] [PhaseOrdering] Add test with removable chained conditions. 
Based on https://godbolt.org/z/hTnra7zdY, which is a slightly more complicated version of the example from https://discourse.llvm.org/t/why-does-llvm-not-perform-range-analysis-on-integer-values/74341 --- .../PhaseOrdering/runtime-check-removal.ll | 78 +++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll b/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll index c159d1b686787..5128614de1d1e 100644 --- a/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll +++ b/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll @@ -56,3 +56,81 @@ loop.latch: exit: ret void } + + +define void @chained_conditions(i64 noundef %a, i64 noundef %b, i64 noundef %c, i64 noundef %d) #0 { +; CHECK-LABEL: @chained_conditions( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A:%.*]], 2048 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[B:%.*]], 1024 +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[CMP]], [[CMP1]] +; CHECK-NEXT: [[CMP3:%.*]] = icmp ugt i64 [[C:%.*]], 1024 +; CHECK-NEXT: [[OR_COND1:%.*]] = or i1 [[OR_COND]], [[CMP3]] +; CHECK-NEXT: br i1 [[OR_COND1]], label [[IF_END10:%.*]], label [[IF_END:%.*]] +; CHECK: if.end: +; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[B]], [[A]] +; CHECK-NEXT: [[ADD4:%.*]] = add nuw nsw i64 [[ADD]], [[C]] +; CHECK-NEXT: [[CMP5_NOT:%.*]] = icmp uge i64 [[ADD4]], [[D:%.*]] +; CHECK-NEXT: [[CMP8_NOT:%.*]] = icmp ult i64 [[A]], [[D]] +; CHECK-NEXT: [[OR_COND7:%.*]] = or i1 [[CMP5_NOT]], [[CMP8_NOT]] +; CHECK-NEXT: br i1 [[OR_COND7]], label [[IF_END10]], label [[IF_THEN9:%.*]] +; CHECK: if.then9: +; CHECK-NEXT: tail call void @bar() +; CHECK-NEXT: br label [[IF_END10]] +; CHECK: if.end10: +; CHECK-NEXT: ret void +; +entry: + %a.addr = alloca i64, align 8 + %b.addr = alloca i64, align 8 + %c.addr = alloca i64, align 8 + %d.addr = alloca i64, align 8 + store i64 %a, ptr %a.addr, align 8 + store i64 %b, ptr %b.addr, align 8 + store i64 %c, ptr %c.addr, align 8 + store i64 %d, ptr %d.addr, align 8 + %0 = load i64, ptr %a.addr, align 8 + %cmp = icmp ugt i64 %0, 2048 + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %1 = load i64, ptr %b.addr, align 8 + %cmp1 = icmp ugt i64 %1, 1024 + br i1 %cmp1, label %if.then, label %lor.lhs.false2 + +lor.lhs.false2: ; preds = %lor.lhs.false + %2 = load i64, ptr %c.addr, align 8 + %cmp3 = icmp ugt i64 %2, 1024 + br i1 %cmp3, label %if.then, label %if.end + +if.then: ; preds = %lor.lhs.false2, %lor.lhs.false, %entry + br label %if.end10 + +if.end: ; preds = %lor.lhs.false2 + %3 = load i64, ptr %a.addr, align 8 + %4 = load i64, ptr %b.addr, align 8 + %add = add i64 %3, %4 + %5 = load i64, ptr %c.addr, align 8 + %add4 = add i64 %add, %5 + %6 = load i64, ptr %d.addr, align 8 + %cmp5 = icmp uge i64 %add4, %6 + br i1 %cmp5, label %if.then6, label %if.end7 + +if.then6: ; preds = %if.end + br label %if.end10 + +if.end7: ; preds = %if.end + %7 = load i64, ptr %a.addr, align 8 + %8 = load i64, ptr %d.addr, align 8 + %cmp8 = icmp uge i64 %7, %8 + br i1 %cmp8, label %if.then9, label %if.end10 + +if.then9: ; preds = %if.end7 + call void @bar() + br label %if.end10 + +if.end10: ; preds = %if.then, %if.then6, %if.then9, %if.end7 + ret void +} + +declare void @bar() From 1ba4a452e51ccfaec0a453b0800dec6f5d557b80 Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Fri, 22 Dec 2023 11:52:35 -0800 Subject: [PATCH 195/342] [test][hwasan] Add -g required for the test --- 
compiler-rt/test/hwasan/TestCases/strip_path_prefix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c index 5e41d03b683e9..d8c96e392b9aa 100644 --- a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c +++ b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c @@ -1,4 +1,4 @@ -// RUN: %clang_hwasan -O0 %s -o %t && %env_hwasan_opts=strip_path_prefix=/TestCases/ not %run %t 2>&1 | FileCheck %s +// RUN: %clang_hwasan -O0 -g %s -o %t && %env_hwasan_opts=strip_path_prefix=/TestCases/ not %run %t 2>&1 | FileCheck %s // Stack histories currently are not recorded on x86. // XFAIL: target=x86_64{{.*}} From 9b6ea5e8f8df3c043fba0a2896ab16d682af01cc Mon Sep 17 00:00:00 2001 From: Felipe Cabarcas <110852406+fel-cab@users.noreply.github.com> Date: Fri, 22 Dec 2023 14:58:11 -0500 Subject: [PATCH 196/342] [OpenMP] Improve omp offload profiler (#68016) Summary: Adding information to the LIBOMPTARGET profiler runtime kernel and API calls. Key changes: * Adding information to runtime calls for better understanding of how the application is executing. For example teams requested by the user, size of memory transfers. * Profile timer was changed from 'us' to 'ns', since 'us' was too coarse-grain to register some important details like key kernel duration * Removed non API or Runtime calls, to reduce complexity of profile for application developers. --------- Co-authored-by: Felipe Cabarcas <cabarcas@leia.crpl.cis.udel.edu> Co-authored-by: fel-cab <fel-cab@github.com> --- openmp/libomptarget/include/Shared/Profile.h | 12 ++++++++ openmp/libomptarget/src/OpenMP/API.cpp | 17 +++++++++--- openmp/libomptarget/src/interface.cpp | 29 ++++++++++---------- openmp/libomptarget/src/omptarget.cpp | 21 +++++++------- 4 files changed, 50 insertions(+), 29 deletions(-) diff --git a/openmp/libomptarget/include/Shared/Profile.h b/openmp/libomptarget/include/Shared/Profile.h index 19ca0cf227518..7e580988a39ba 100644 --- a/openmp/libomptarget/include/Shared/Profile.h +++ b/openmp/libomptarget/include/Shared/Profile.h @@ -97,4 +97,16 @@ class Profiler { std::string RTM = RegionTypeMsg; \ llvm::TimeTraceScope TimeScope(__FUNCTION__, ProfileLocation + RTM) +/// Time spend in the current scope, assigned to the regionType +/// with details from runtime +#define TIMESCOPE_WITH_DETAILS_AND_IDENT(RegionTypeMsg, Details, IDENT) \ + SourceInfo SI(IDENT); \ + std::string ProfileLocation = SI.getProfileLocation(); \ + llvm::TimeTraceScope TimeScope(RegionTypeMsg, ProfileLocation + Details) + +/// Time spend in the current scope, assigned to the function name and source +/// with details +#define TIMESCOPE_WITH_DETAILS(Details) \ + llvm::TimeTraceScope TimeScope(__FUNCTION__, Details) + #endif // OMPTARGET_SHARED_PROFILE_H diff --git a/openmp/libomptarget/src/OpenMP/API.cpp b/openmp/libomptarget/src/OpenMP/API.cpp index 1769404faf888..a7b6eac8bcd65 100644 --- a/openmp/libomptarget/src/OpenMP/API.cpp +++ b/openmp/libomptarget/src/OpenMP/API.cpp @@ -83,6 +83,8 @@ EXTERN int omp_get_initial_device(void) { } EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) { + TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) + + ";size=" + std::to_string(Size)); return targetAllocExplicit(Size, DeviceNum, TARGET_ALLOC_DEFAULT, __func__); } @@ -99,6 +101,7 @@ EXTERN void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum) { } EXTERN void omp_target_free(void *Ptr, int DeviceNum) { + TIMESCOPE(); return 
targetFreeExplicit(Ptr, DeviceNum, TARGET_ALLOC_DEFAULT, __func__); } @@ -161,7 +164,9 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) { EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length, size_t DstOffset, size_t SrcOffset, int DstDevice, int SrcDevice) { - TIMESCOPE(); + TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) + + ";src_dev=" + std::to_string(SrcDevice) + + ";size=" + std::to_string(Length)); DP("Call to omp_target_memcpy, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " "src offset %zu, length %zu\n", @@ -400,7 +405,9 @@ EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length, size_t DstOffset, size_t SrcOffset, int DstDevice, int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) { - TIMESCOPE(); + TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) + + ";src_dev=" + std::to_string(SrcDevice) + + ";size=" + std::to_string(Length)); DP("Call to omp_target_memcpy_async, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " "src offset %zu, length %zu\n", @@ -429,7 +436,6 @@ omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize, const size_t *DstOffsets, const size_t *SrcOffsets, const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice, int SrcDevice) { - TIMESCOPE(); DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " @@ -488,7 +494,10 @@ EXTERN int omp_target_memcpy_rect_async( const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets, const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice, int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) { - TIMESCOPE(); + TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DstDevice) + + ";src_dev=" + std::to_string(SrcDevice) + + ";size=" + std::to_string(ElementSize) + + ";num_dims=" + std::to_string(NumDims)); DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, " "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index d9e87640161f2..61d9db17f5100 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -33,14 +33,12 @@ using namespace llvm::omp::target::ompt; //////////////////////////////////////////////////////////////////////////////// /// adds requires flags EXTERN void __tgt_register_requires(int64_t Flags) { - TIMESCOPE(); PM->addRequirements(Flags); } //////////////////////////////////////////////////////////////////////////////// /// adds a target shared library to the target execution image EXTERN void __tgt_register_lib(__tgt_bin_desc *Desc) { - TIMESCOPE(); if (PM->delayRegisterLib(Desc)) return; @@ -54,7 +52,6 @@ EXTERN void __tgt_init_all_rtls() { PM->initAllPlugins(); } //////////////////////////////////////////////////////////////////////////////// /// unloads a target shared library EXTERN void __tgt_unregister_lib(__tgt_bin_desc *Desc) { - TIMESCOPE(); PM->unregisterLib(Desc); } @@ -68,7 +65,8 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, "TargetAsyncInfoTy must be convertible to AsyncInfoTy."); - 
TIMESCOPE_WITH_RTM_AND_IDENT(RegionTypeMsg, Loc); + TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: Data Copy", + "NumArgs=" + std::to_string(ArgNum), Loc); DP("Entering data %s region for device %" PRId64 " with %d mappings\n", RegionName, DeviceId, ArgNum); @@ -240,9 +238,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, KernelArgsTy *KernelArgs) { static_assert(std::is_convertible_v<TargetAsyncInfoTy, AsyncInfoTy>, "Target AsyncInfoTy must be convertible to AsyncInfoTy."); - - TIMESCOPE_WITH_IDENT(Loc); - DP("Entering target region for device %" PRId64 " with entry point " DPxMOD "\n", DeviceId, DPxPTR(HostPtr)); @@ -267,6 +262,11 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, assert(KernelArgs->ThreadLimit[0] == static_cast<uint32_t>(ThreadLimit) && !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && "OpenMP interface should not use multiple dimensions"); + TIMESCOPE_WITH_DETAILS_AND_IDENT( + "Runtime: target exe", + "NumTeams=" + std::to_string(NumTeams) + + ";NumArgs=" + std::to_string(KernelArgs->NumArgs), + Loc); if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) printKernelArguments(Loc, DeviceId, KernelArgs->NumArgs, @@ -297,13 +297,14 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, int Rc = OFFLOAD_SUCCESS; Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo); + { // required to show syncronization + TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: syncronize", "", Loc); + if (Rc == OFFLOAD_SUCCESS) + Rc = AsyncInfo.synchronize(); - if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); - - handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); - assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); - + handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); + assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); + } return OMP_TGT_SUCCESS; } @@ -402,7 +403,6 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, // Get the current number of components for a user-defined mapper. 
EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) { - TIMESCOPE(); auto *MapperComponentsPtr = (struct MapperComponentsTy *)RtMapperHandle; int64_t Size = MapperComponentsPtr->Components.size(); DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", @@ -414,7 +414,6 @@ EXTERN int64_t __tgt_mapper_num_components(void *RtMapperHandle) { EXTERN void __tgt_push_mapper_component(void *RtMapperHandle, void *Base, void *Begin, int64_t Size, int64_t Type, void *Name) { - TIMESCOPE(); DP("__tgt_push_mapper_component(Handle=" DPxMOD ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 ", Type=0x%" PRIx64 ", Name=%s).\n", diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index e724b2f6db8b5..a7d55d7ebd539 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -392,7 +392,6 @@ static int32_t getParentIndex(int64_t Type) { void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, const char *Name) { - TIMESCOPE(); DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size); if (Size <= 0) { @@ -419,7 +418,6 @@ void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, const char *Name) { - TIMESCOPE(); DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum, DPxPTR(DevicePtr)); @@ -444,7 +442,6 @@ void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, const char *Name) { - TIMESCOPE(); DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size); if (Size <= 0) { @@ -471,7 +468,6 @@ void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, } void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { - TIMESCOPE(); DP("Call to %s for device %d unlocking\n", Name, DeviceNum); auto DeviceOrErr = PM->getDevice(DeviceNum); @@ -531,14 +527,14 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) { - TIMESCOPE_WITH_IDENT(Loc); // process each input. for (int32_t I = 0; I < ArgNum; ++I) { // Ignore private variables and arrays - there is no mapping for them. if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) continue; - + TIMESCOPE_WITH_DETAILS_AND_IDENT( + "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc); if (ArgMappers && ArgMappers[I]) { // Instead of executing the regular path of targetDataBegin, call the // targetDataMapper variant which will call targetDataBegin again @@ -913,7 +909,8 @@ int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, !TPR.Flags.IsHostPointer && DataSize != 0) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - + TIMESCOPE_WITH_DETAILS_AND_IDENT( + "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc); // Wait for any previous transfer if an event is present. 
if (void *Event = TPR.getEntry()->getEvent()) { if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) { @@ -1403,7 +1400,6 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, SmallVector<ptrdiff_t> &TgtOffsets, PrivateArgumentManagerTy &PrivateArgumentManager, AsyncInfoTy &AsyncInfo) { - TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc); auto DeviceOrErr = PM->getDevice(DeviceId); if (!DeviceOrErr) @@ -1537,7 +1533,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr, map_var_info_t *ArgNames, void **ArgMappers, PrivateArgumentManagerTy &PrivateArgumentManager, AsyncInfoTy &AsyncInfo) { - TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc); + auto DeviceOrErr = PM->getDevice(DeviceId); if (!DeviceOrErr) FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str()); @@ -1639,7 +1635,12 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, { assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!"); - TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc); + TIMESCOPE_WITH_DETAILS_AND_IDENT( + "Kernel Target", + "NumArguments=" + std::to_string(KernelArgs.NumArgs) + + ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) + + ";TripCount=" + std::to_string(KernelArgs.Tripcount), + Loc); #ifdef OMPT_SUPPORT assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 && From 345d7b1618d48e658d3a72c5c258c1168dcee4bd Mon Sep 17 00:00:00 2001 From: Yingwei Zheng <dtcxzyw2333@gmail.com> Date: Sat, 23 Dec 2023 04:41:32 +0800 Subject: [PATCH 197/342] [InstCombine] Fold minmax intrinsic using KnownBits information (#76242) This patch tries to fold minmax intrinsic by using `computeConstantRangeIncludingKnownBits`. Fixes regression in [_karatsuba_rec:cpython/Modules/_decimal/libmpdec/mpdecimal.c](https://github.com/python/cpython/blob/c31943af16f885c8cf5d5a690c25c366afdb2862/Modules/_decimal/libmpdec/mpdecimal.c#L5460-L5462), which was introduced by #71396. See also https://github.com/dtcxzyw/llvm-opt-benchmark/issues/16#issuecomment-1865875756. Alive2 for splat vectors with undef: https://alive2.llvm.org/ce/z/J8hKWd --- llvm/include/llvm/Analysis/ValueTracking.h | 5 + llvm/lib/Analysis/ValueTracking.cpp | 8 +- .../InstCombine/InstCombineCalls.cpp | 17 ++++ .../InstCombine/minmax-intrinsics.ll | 92 +++++++++++++++++++ 4 files changed, 118 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index a3186e61b94ad..baa16306ebf5d 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -863,6 +863,11 @@ ConstantRange computeConstantRange(const Value *V, bool ForSigned, const DominatorTree *DT = nullptr, unsigned Depth = 0); +/// Combine constant ranges from computeConstantRange() and computeKnownBits(). +ConstantRange +computeConstantRangeIncludingKnownBits(const WithCache<const Value *> &V, + bool ForSigned, const SimplifyQuery &SQ); + /// Return true if this function can prove that the instruction I will /// always transfer execution to one of its successors (including the next /// instruction that follows within a basic block). E.g. 
this is not diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 769d921eb1e8d..cac2602d455f9 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -6289,10 +6289,10 @@ static OverflowResult mapOverflowResult(ConstantRange::OverflowResult OR) { } /// Combine constant ranges from computeConstantRange() and computeKnownBits(). -static ConstantRange -computeConstantRangeIncludingKnownBits(const WithCache<const Value *> &V, - bool ForSigned, - const SimplifyQuery &SQ) { +ConstantRange +llvm::computeConstantRangeIncludingKnownBits(const WithCache<const Value *> &V, + bool ForSigned, + const SimplifyQuery &SQ) { ConstantRange CR1 = ConstantRange::fromKnownBits(V.getKnownBits(SQ), ForSigned); ConstantRange CR2 = computeConstantRange(V, ForSigned, SQ.IIQ.UseInstrInfo); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index a272357fa04a4..3b7fe7fa22660 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1796,6 +1796,23 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Instruction *NewMinMax = factorizeMinMaxTree(II)) return NewMinMax; + // Try to fold minmax with constant RHS based on range information + const APInt *RHSC; + if (match(I1, m_APIntAllowUndef(RHSC))) { + ICmpInst::Predicate Pred = + ICmpInst::getNonStrictPredicate(MinMaxIntrinsic::getPredicate(IID)); + bool IsSigned = MinMaxIntrinsic::isSigned(IID); + ConstantRange LHS_CR = computeConstantRangeIncludingKnownBits( + I0, IsSigned, SQ.getWithInstruction(II)); + if (!LHS_CR.isFullSet()) { + if (LHS_CR.icmp(Pred, *RHSC)) + return replaceInstUsesWith(*II, I0); + if (LHS_CR.icmp(ICmpInst::getSwappedPredicate(Pred), *RHSC)) + return replaceInstUsesWith(*II, + ConstantInt::get(II->getType(), *RHSC)); + } + } + break; } case Intrinsic::bitreverse: { diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll index f3833a420ee83..ae2e115b1dd9a 100644 --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -2489,3 +2489,95 @@ define i1 @PR57986() { %umin = call i1 @llvm.umin.i1(i1 ptrtoint (ptr @g to i1), i1 true) ret i1 %umin } + +define i8 @fold_umax_with_knownbits_info(i8 %a, i8 %b) { +; CHECK-LABEL: @fold_umax_with_knownbits_info( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = or i8 [[A:%.*]], 1 +; CHECK-NEXT: [[A2:%.*]] = shl i8 [[B:%.*]], 1 +; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[A1]], [[A2]] +; CHECK-NEXT: ret i8 [[SUB]] +; +entry: + %a1 = or i8 %a, 1 + %a2 = shl i8 %b, 1 + %sub = sub i8 %a1, %a2 + %val = call i8 @llvm.umax.i8(i8 %sub, i8 1) + ret i8 %val +} + +define <3 x i8> @fold_umax_with_knownbits_info_undef_in_splat(<3 x i8> %a, <3 x i8> %b) { +; CHECK-LABEL: @fold_umax_with_knownbits_info_undef_in_splat( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = or <3 x i8> [[A:%.*]], <i8 1, i8 1, i8 1> +; CHECK-NEXT: [[A2:%.*]] = shl <3 x i8> [[B:%.*]], <i8 1, i8 1, i8 1> +; CHECK-NEXT: [[SUB:%.*]] = sub <3 x i8> [[A1]], [[A2]] +; CHECK-NEXT: ret <3 x i8> [[SUB]] +; +entry: + %a1 = or <3 x i8> %a, <i8 1, i8 1, i8 1> + %a2 = shl <3 x i8> %b, <i8 1, i8 1, i8 1> + %sub = sub <3 x i8> %a1, %a2 + %val = call <3 x i8> @llvm.umax.v3i8(<3 x i8> %sub, <3 x i8> <i8 1, i8 undef, i8 1>) + ret <3 x i8> %val +} + +define i8 @fold_umin_with_knownbits_info(i8 %a, i8 %b) { +; 
CHECK-LABEL: @fold_umin_with_knownbits_info( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i8 3 +; +entry: + %a1 = or i8 %a, 3 + %a2 = shl i8 %b, 2 + %sub = sub i8 %a1, %a2 + %val = call i8 @llvm.umin.i8(i8 %sub, i8 3) + ret i8 %val +} + +define <3 x i8> @fold_umin_with_knownbits_info_undef_in_splat(<3 x i8> %a, <3 x i8> %b) { +; CHECK-LABEL: @fold_umin_with_knownbits_info_undef_in_splat( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret <3 x i8> <i8 3, i8 3, i8 3> +; +entry: + %a1 = or <3 x i8> %a, <i8 3, i8 3, i8 3> + %a2 = shl <3 x i8> %b, <i8 2, i8 2, i8 2> + %sub = sub <3 x i8> %a1, %a2 + %val = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %sub, <3 x i8> <i8 3, i8 undef, i8 3>) + ret <3 x i8> %val +} + +define i8 @fold_umax_with_knownbits_info_fail(i8 %a, i8 %b) { +; CHECK-LABEL: @fold_umax_with_knownbits_info_fail( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = or i8 [[A:%.*]], 2 +; CHECK-NEXT: [[A2:%.*]] = shl i8 [[B:%.*]], 1 +; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[A1]], [[A2]] +; CHECK-NEXT: [[VAL:%.*]] = call i8 @llvm.umax.i8(i8 [[SUB]], i8 1) +; CHECK-NEXT: ret i8 [[VAL]] +; +entry: + %a1 = or i8 %a, 2 + %a2 = shl i8 %b, 1 + %sub = sub i8 %a1, %a2 + %val = call i8 @llvm.umax.i8(i8 %sub, i8 1) + ret i8 %val +} + +define i8 @fold_umin_with_knownbits_info_fail(i8 %a, i8 %b) { +; CHECK-LABEL: @fold_umin_with_knownbits_info_fail( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = or i8 [[A:%.*]], 1 +; CHECK-NEXT: [[A2:%.*]] = shl i8 [[B:%.*]], 2 +; CHECK-NEXT: [[SUB:%.*]] = sub i8 [[A1]], [[A2]] +; CHECK-NEXT: [[VAL:%.*]] = call i8 @llvm.umin.i8(i8 [[SUB]], i8 3) +; CHECK-NEXT: ret i8 [[VAL]] +; +entry: + %a1 = or i8 %a, 1 + %a2 = shl i8 %b, 2 + %sub = sub i8 %a1, %a2 + %val = call i8 @llvm.umin.i8(i8 %sub, i8 3) + ret i8 %val +} From 813a671232e2ecd49a4c861ca34727a01abf09c9 Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough <ethanluismcdonough@gmail.com> Date: Fri, 22 Dec 2023 14:44:23 -0600 Subject: [PATCH 198/342] [OpenMP] Remove unnecessary dependencies from plugin unit tests (#76266) This was an oversight that seems to be causing problems on certain builds. This patch should fix #76225. --- openmp/libomptarget/unittests/Plugins/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/libomptarget/unittests/Plugins/CMakeLists.txt b/openmp/libomptarget/unittests/Plugins/CMakeLists.txt index e137d2a9d1774..67d4d9b463b0c 100644 --- a/openmp/libomptarget/unittests/Plugins/CMakeLists.txt +++ b/openmp/libomptarget/unittests/Plugins/CMakeLists.txt @@ -1,4 +1,4 @@ -set(PLUGINS_TEST_COMMON omptarget OMPT omptarget.devicertl) +set(PLUGINS_TEST_COMMON omptarget) set(PLUGINS_TEST_SOURCES NextgenPluginsTest.cpp) set(PLUGINS_TEST_INCLUDE ${LIBOMPTARGET_INCLUDE_DIR}) From ceccacb0b835167fe23ec7cc872edd1d046a30cb Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Fri, 22 Dec 2023 12:54:12 -0800 Subject: [PATCH 199/342] [test][hwasan] Re-enabled android test --- compiler-rt/test/hwasan/TestCases/strip_path_prefix.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c index d8c96e392b9aa..80ef32699f8f4 100644 --- a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c +++ b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c @@ -3,9 +3,6 @@ // Stack histories currently are not recorded on x86. // XFAIL: target=x86_64{{.*}} -// FIXME: Android does not see a variable. 
-// XFAIL: android - #include <assert.h> #include <sanitizer/hwasan_interface.h> #include <stdio.h> From 0e039fc39e29320c9a0eeadb34a5e83ac51d48ba Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Fri, 22 Dec 2023 14:23:25 -0800 Subject: [PATCH 200/342] [APINotes] Use DenseMap::contains (NFC) --- clang/lib/APINotes/APINotesManager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/APINotes/APINotesManager.cpp b/clang/lib/APINotes/APINotesManager.cpp index a921c8b9fce3e..d3aef09dac910 100644 --- a/clang/lib/APINotes/APINotesManager.cpp +++ b/clang/lib/APINotes/APINotesManager.cpp @@ -125,7 +125,7 @@ APINotesManager::loadAPINotes(StringRef Buffer) { bool APINotesManager::loadAPINotes(const DirectoryEntry *HeaderDir, FileEntryRef APINotesFile) { - assert(Readers.find(HeaderDir) == Readers.end()); + assert(!Readers.contains(HeaderDir)); if (auto Reader = loadAPINotes(APINotesFile)) { Readers[HeaderDir] = Reader.release(); return false; From 4532617ae420056bf32f6403dde07fb99d276a49 Mon Sep 17 00:00:00 2001 From: James Y Knight <jyknight@google.com> Date: Fri, 22 Dec 2023 17:35:26 -0500 Subject: [PATCH 201/342] Change release branch creation process to bump version to N.1.0. (#75743) This will help distinguish release branch builds from development branch builds, and is similar to GCC's version numbering policy. Thus, the branch `releases/18.x` will start out numbered 18.1.0, instead of 18.0.0. Unchanged are other versioning policies: - mainline will be numbered 18.0.0, 19.0.0, ... - typical release branch releases will increment micro version, e.g. 18.1.1, 18.1.2, .... - If an ABI break is required on the release branch, the minor version will be incremented, e.g. to 18.2.0. See the Discourse RFC: https://discourse.llvm.org/t/rfc-name-the-first-release-from-a-branch-n-1-0-instead-of-n-0-0/75384 --- llvm/docs/HowToReleaseLLVM.rst | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/llvm/docs/HowToReleaseLLVM.rst b/llvm/docs/HowToReleaseLLVM.rst index 15397c593d497..51ab6dfd8d8d5 100644 --- a/llvm/docs/HowToReleaseLLVM.rst +++ b/llvm/docs/HowToReleaseLLVM.rst @@ -40,16 +40,16 @@ Release Approx. Date =============================== ========================= *release branch: even releases* *4th Tue in January* *release branch: odd releases* *4th Tue in July* -X.0.0-rc1 3 days after branch. -X.0.0-rc2 2 weeks after branch. -X.0.0-rc3 4 weeks after branch -**X.0.0-final** **6 weeks after branch** -**X.0.1** **8 weeks after branch** -**X.0.2** **10 weeks after branch** -**X.0.3** **12 weeks after branch** -**X.0.4** **14 weeks after branch** -**X.0.5** **16 weeks after branch** -**X.0.6 (if necessary)** **18 weeks after branch** +X.1.0-rc1 3 days after branch. +X.1.0-rc2 2 weeks after branch. +X.1.0-rc3 4 weeks after branch +**X.1.0-final** **6 weeks after branch** +**X.1.1** **8 weeks after branch** +**X.1.2** **10 weeks after branch** +**X.1.3** **12 weeks after branch** +**X.1.4** **14 weeks after branch** +**X.1.5** **16 weeks after branch** +**X.1.6 (if necessary)** **18 weeks after branch** =============================== ========================= Release Process Summary @@ -77,7 +77,7 @@ Release Process Summary * Announce bug fix release schedule to the LLVM community and update the website. -* Do bug-fix releases every two weeks until X.0.5 or X.0.6 (if necessary). +* Do bug-fix releases every two weeks until X.1.5 or X.1.6 (if necessary). 
Release Process =============== @@ -123,6 +123,9 @@ Branch the Git trunk using the following procedure: version bump. The branch's name is release/X.x where ``X`` is the major version number and ``x`` is just the letter ``x``. +#. On the newly-created release branch, immediately bump the version + to X.1.0git (where ``X`` is the major version of the branch.) + #. All tags and branches need to be created in both the llvm/llvm-project and llvm/llvm-test-suite repos. @@ -406,13 +409,13 @@ Announce the Release ^^^^^^^^^^^^^^^^^^^^ Create a new post in the `Announce Category <https://discourse.llvm.org/c/announce>`_ -once all the release tasks are complete. For X.0.0 releases, make sure to include a -link to the release notes in the post. For X.0.1+ releases, generate a changelog +once all the release tasks are complete. For X.1.0 releases, make sure to include a +link to the release notes in the post. For X.1.1+ releases, generate a changelog using this command and add it to the post. :: - $ git log --format="- %aN: [%s (%h)](https://github.com/llvm/llvm-project/commit/%H)" llvmorg-X.0.N-1..llvmorg-X.0.N + $ git log --format="- %aN: [%s (%h)](https://github.com/llvm/llvm-project/commit/%H)" llvmorg-X.1.N-1..llvmorg-X.1.N Once the release has been announced add a link to the announcement on the llvm homepage (from the llvm-www repo) in the "Release Emails" section. From 03dc806b128a94771ff9e9b56b28babdfb3c0931 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Fri, 22 Dec 2023 14:51:22 -0800 Subject: [PATCH 202/342] [Transforms] Use {DenseMap,SmallPtrSet}::contains (NFC) --- llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp | 2 +- llvm/lib/Transforms/Scalar/GVN.cpp | 2 +- llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 2 +- llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp | 2 +- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index edfeb36f3422e..c5bf913cda301 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -521,7 +521,7 @@ struct AllSwitchPaths { const BasicBlock *PrevBB = Path.back(); for (const BasicBlock *BB : Path) { - if (StateDef.count(BB) != 0) { + if (StateDef.contains(BB)) { const PHINode *Phi = dyn_cast<PHINode>(StateDef[BB]); assert(Phi && "Expected a state-defining instr to be a phi node."); diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 5e58af0edc155..e36578f3de7ac 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -592,7 +592,7 @@ uint32_t GVNPass::ValueTable::lookupOrAddCall(CallInst *C) { /// Returns true if a value number exists for the specified value. 
bool GVNPass::ValueTable::exists(Value *V) const { - return valueNumbering.count(V) != 0; + return valueNumbering.contains(V); } /// lookup_or_add - Returns the value number for the specified value, assigning diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 40b4ea92e1ff9..3f02441b74ba8 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -2057,7 +2057,7 @@ static void relocationViaAlloca( for (const auto &Info : Records) for (auto RematerializedValuePair : Info.RematerializedValues) { Value *OriginalValue = RematerializedValuePair.second; - if (AllocaMap.count(OriginalValue) != 0) + if (AllocaMap.contains(OriginalValue)) continue; emitAllocaFor(OriginalValue); diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 1e42d7491676d..f94047633022c 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -64,7 +64,7 @@ bool forAllReachableExits(const DominatorTree &DT, const PostDominatorTree &PDT, // sure that the return is covered. Otherwise, we can check whether there // is a way to reach the RI from the start of the lifetime without passing // through an end. - if (EndBlocks.count(RI->getParent()) > 0 || + if (EndBlocks.contains(RI->getParent()) || !isPotentiallyReachable(Start, RI, &EndBlocks, &DT, &LI)) { ++NumCoveredExits; } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5c325ad8a291a..32913b3f55697 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6443,7 +6443,7 @@ bool BoUpSLP::areAllUsersVectorized( Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const { return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) || all_of(I->users(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0 || + return ScalarToTreeEntry.contains(U) || isVectorLikeInstWithConstOps(U) || (isa<ExtractElementInst>(U) && MustGather.contains(U)); }); From 2205d2334f3c859ad9f6c65ed950bfb3bb6f7cbe Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Fri, 22 Dec 2023 15:20:00 -0800 Subject: [PATCH 203/342] Revert "[Sema] Fix crash on invalid code with parenthesized aggregate initialization" (#76272) Reverts llvm/llvm-project#76232 and 7ab16fb5207fe187ab999f882069bd632d2e68e5 to recover build bots. Breaks libc++ tests, details in #76232 #76228 --- clang/lib/Sema/SemaInit.cpp | 8 ------- clang/test/SemaCXX/crash-GH76228.cpp | 28 ---------------------- clang/test/SemaCXX/paren-list-agg-init.cpp | 2 +- 3 files changed, 1 insertion(+), 37 deletions(-) delete mode 100644 clang/test/SemaCXX/crash-GH76228.cpp diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index cc9db5ded1149..61d244f3bb979 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -5512,14 +5512,6 @@ static void TryOrBuildParenListInitialization( } else if (auto *RT = Entity.getType()->getAs<RecordType>()) { bool IsUnion = RT->isUnionType(); const CXXRecordDecl *RD = cast<CXXRecordDecl>(RT->getDecl()); - if (RD->isInvalidDecl()) { - // Exit early to avoid confusion when processing members. - // We do the same for braced list initialization in - // `CheckStructUnionTypes`. 
- Sequence.SetFailed( - clang::InitializationSequence::FK_ParenthesizedListInitFailed); - return; - } if (!IsUnion) { for (const CXXBaseSpecifier &Base : RD->bases()) { diff --git a/clang/test/SemaCXX/crash-GH76228.cpp b/clang/test/SemaCXX/crash-GH76228.cpp deleted file mode 100644 index 33a9395823127..0000000000000 --- a/clang/test/SemaCXX/crash-GH76228.cpp +++ /dev/null @@ -1,28 +0,0 @@ -// RUN: %clang_cc1 -std=c++20 -verify %s -// Check we don't crash on incomplete members and bases when handling parenthesized initialization. -class incomplete; // expected-note@-0 3 {{forward declaration of 'incomplete'}} -struct foo { - int a; - incomplete b; - // expected-error@-1 {{incomplete type}} -}; -foo a1(0); - -struct one_int { - int a; -}; -struct bar : one_int, incomplete {}; -// expected-error@-1 {{incomplete type}} -bar a2(0); - -incomplete a3[3](1,2,3); -// expected-error@-1 {{incomplete type}} - -struct qux : foo { -}; -qux a4(0); - -struct fred { - foo a[3]; -}; -fred a5(0); diff --git a/clang/test/SemaCXX/paren-list-agg-init.cpp b/clang/test/SemaCXX/paren-list-agg-init.cpp index c1964a5a9eb00..f60b20e0d4656 100644 --- a/clang/test/SemaCXX/paren-list-agg-init.cpp +++ b/clang/test/SemaCXX/paren-list-agg-init.cpp @@ -289,7 +289,7 @@ int test() { // used to crash S a(0, 1); S b(0); - S c(0, 0, 1); + S c(0, 0, 1); // beforecxx20-warning {{aggregate initialization of type 'S' from a parenthesized list of values is a C++20 extension}} S d {0, 1}; S e {0}; From 63c314835458211b165e48b2278800ea6d52620a Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman154@yahoo.com> Date: Fri, 22 Dec 2023 17:55:26 -0800 Subject: [PATCH 204/342] [llvm-exegesis] Fix stack pointer register restoration 9eb80ab3787e1851be8c686651688e870b93506b changed the method for stack pointer restoration to fix segmentation faults. However, I made a mistake in the patch and swapped a != for a ==, which caused an arbitrary register (the first one specified) to get restored rather than the stack pointer specifically. This patch fixes that issue and adds test coverage to prevent regression. 
--- .../latency/subprocess-preserved-registers.s | 39 ++++++++++++++++--- llvm/tools/llvm-exegesis/lib/Assembler.cpp | 2 +- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/subprocess-preserved-registers.s b/llvm/test/tools/llvm-exegesis/X86/latency/subprocess-preserved-registers.s index 1c680b7c66d27..e46da53a212c5 100644 --- a/llvm/test/tools/llvm-exegesis/X86/latency/subprocess-preserved-registers.s +++ b/llvm/test/tools/llvm-exegesis/X86/latency/subprocess-preserved-registers.s @@ -7,9 +7,18 @@ # LLVM-EXEGESIS-DEFREG RAX 3 # LLVM-EXEGESIS-DEFREG RCX 5 -# LLVM-EXEGESIS-DEFREG RDI 7 -# LLVM-EXEGESIS-DEFREG RSI B -# LLVM-EXEGESIS-DEFREG R11 D +# LLVM-EXEGESIS-DEFREG RDX 7 +# LLVM-EXEGESIS-DEFREG RBX B +# LLVM-EXEGESIS-DEFREG RSI D +# LLVM-EXEGESIS-DEFREG RDI 11 +# LLVM-EXEGESIS-DEFREG RSP 13 +# LLVM-EXEGESIS-DEFREG RBP 17 +# LLVM-EXEGESIS-DEFREG R8 1D +# LLVM-EXEGESIS-DEFREG R9 1F +# LLVM-EXEGESIS-DEFREG R10 29 +# LLVM-EXEGESIS-DEFREG R11 2B +# LLVM-EXEGESIS-DEFREG R12 2F +# LLVM-EXEGESIS-DEFREG R13 35 # LLVM-EXEGESIS-DEFREG R14 127 # LLVM-EXEGESIS-DEFREG R15 0 @@ -17,11 +26,29 @@ cmpq $0x3, %rax cmovneq %r14, %r15 cmpq $0x5, %rcx cmovneq %r14, %r15 -cmpq $0x7, %rdi +cmpq $0x7, %rdx cmovneq %r14, %r15 -cmpq $0xB, %rsi +cmpq $0xB, %rbx cmovneq %r14, %r15 -cmpq $0xD, %r11 +cmpq $0xD, %rsi +cmovneq %r14, %r15 +cmpq $0x11, %rdi +cmovneq %r14, %r15 +cmpq $0x13, %rsp +cmovneq %r14, %r15 +cmpq $0x17, %rbp +cmovneq %r14, %r15 +cmpq $0x1d, %r8 +cmovneq %r14, %r15 +cmpq $0x1f, %r9 +cmovneq %r14, %r15 +cmpq $0x29, %r10 +cmovneq %r14, %r15 +cmpq $0x2b, %r11 +cmovneq %r14, %r15 +cmpq $0x2f, %r12 +cmovneq %r14, %r15 +cmpq $0x35, %r13 cmovneq %r14, %r15 movq $60, %rax diff --git a/llvm/tools/llvm-exegesis/lib/Assembler.cpp b/llvm/tools/llvm-exegesis/lib/Assembler.cpp index c2fad7c731a7d..96b5a068ff21f 100644 --- a/llvm/tools/llvm-exegesis/lib/Assembler.cpp +++ b/llvm/tools/llvm-exegesis/lib/Assembler.cpp @@ -97,7 +97,7 @@ static bool generateSnippetSetupCode( // Load in the stack register now as we're done using it elsewhere // and need to set the value in preparation for executing the // snippet. - if (RV.Register == StackPointerRegister) + if (RV.Register != StackPointerRegister) continue; const auto SetRegisterCode = ET.setRegTo(*MSI, RV.Register, RV.Value); if (SetRegisterCode.empty()) From 61b5cc6654addf2f1c84c8c3e49f98ad6f022cb9 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe@amd.com> Date: Sat, 23 Dec 2023 07:58:43 +0530 Subject: [PATCH 205/342] [LLVM] ConvergenceControlInst as a derived class of IntrinsicInst (#76230) --- .../llvm/IR/GenericConvergenceVerifierImpl.h | 13 +--------- llvm/include/llvm/IR/IntrinsicInst.h | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h index e2ece30b18641..f6eb5066d5535 100644 --- a/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h +++ b/llvm/include/llvm/IR/GenericConvergenceVerifierImpl.h @@ -29,7 +29,7 @@ #include "llvm/ADT/GenericConvergenceVerifier.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Twine.h" -#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" #define Check(C, ...) 
\ do { \ @@ -48,17 +48,6 @@ } while (false) namespace llvm { -static bool isConvergenceControlIntrinsic(unsigned IntrinsicID) { - switch (IntrinsicID) { - default: - return false; - case Intrinsic::experimental_convergence_anchor: - case Intrinsic::experimental_convergence_entry: - case Intrinsic::experimental_convergence_loop: - return true; - } -} - template <class ContextT> void GenericConvergenceVerifier<ContextT>::clear() { Tokens.clear(); CI.clear(); diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 8940bebd2c9a2..b8d578d0fee08 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -1724,6 +1724,30 @@ class AssumeInst : public IntrinsicInst { } }; +/// Check if \p ID corresponds to a convergence control intrinsic. +static inline bool isConvergenceControlIntrinsic(unsigned IntrinsicID) { + switch (IntrinsicID) { + default: + return false; + case Intrinsic::experimental_convergence_anchor: + case Intrinsic::experimental_convergence_entry: + case Intrinsic::experimental_convergence_loop: + return true; + } +} + +/// Represents calls to the llvm.experimintal.convergence.* intrinsics. +class ConvergenceControlInst : public IntrinsicInst { +public: + static bool classof(const IntrinsicInst *I) { + return isConvergenceControlIntrinsic(I->getIntrinsicID()); + } + + static bool classof(const Value *V) { + return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); + } +}; + } // end namespace llvm #endif // LLVM_IR_INTRINSICINST_H From bbe1b06fbb7127d613cb4958e06c737967878388 Mon Sep 17 00:00:00 2001 From: smanna12 <soumi.manna@intel.com> Date: Fri, 22 Dec 2023 20:39:22 -0600 Subject: [PATCH 206/342] [NFC][CLANG] Fix static analyzer bugs about unnecessary object copies with auto keyword (#75082) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by Static Analyzer Tool: In ​EmitAssemblyHelper::​RunOptimizationPipeline(): Using the auto keyword without an & causes the copy of an object of type function. /// List of pass builder callbacks ("CodeGenOptions.h"). std::vector<std::function<void(llvm::PassBuilder &)>> PassBuilderCallbacks; --- clang/lib/CodeGen/BackendUtil.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 480410db1021b..a6142d99f3b68 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -881,7 +881,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( << PluginFN << toString(PassPlugin.takeError()); } } - for (auto PassCallback : CodeGenOpts.PassBuilderCallbacks) + for (const auto &PassCallback : CodeGenOpts.PassBuilderCallbacks) PassCallback(PB); #define HANDLE_EXTENSION(Ext) \ get##Ext##PluginInfo().RegisterPassBuilderCallbacks(PB); From 31aa7d2de018693a6b45c9056a67229c54461b8f Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Fri, 22 Dec 2023 18:42:55 -0800 Subject: [PATCH 207/342] [RISCV] Use riscv_bitmanip.h in zbb.c. NFC I missed this when converting other scalar bitmanip/crypto tests to use intrinsics intead of builtins. 
--- clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c b/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c index 3a421f8c6cd42..5edbc578e82e9 100644 --- a/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c +++ b/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c @@ -6,6 +6,8 @@ // RUN: -disable-O0-optnone | opt -S -passes=mem2reg \ // RUN: | FileCheck %s -check-prefix=RV64ZBB +#include <riscv_bitmanip.h> + // RV32ZBB-LABEL: @orc_b_32( // RV32ZBB-NEXT: entry: // RV32ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.riscv.orc.b.i32(i32 [[A:%.*]]) @@ -16,8 +18,8 @@ // RV64ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.riscv.orc.b.i32(i32 [[A:%.*]]) // RV64ZBB-NEXT: ret i32 [[TMP0]] // -unsigned int orc_b_32(unsigned int a) { - return __builtin_riscv_orc_b_32(a); +uint32_t orc_b_32(uint32_t a) { + return __riscv_orc_b_32(a); } #if __riscv_xlen == 64 @@ -26,8 +28,8 @@ unsigned int orc_b_32(unsigned int a) { // RV64ZBB-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.orc.b.i64(i64 [[A:%.*]]) // RV64ZBB-NEXT: ret i64 [[TMP0]] // -unsigned long orc_b_64(unsigned long a) { - return __builtin_riscv_orc_b_64(a); +uint64_t orc_b_64(uint64_t a) { + return __riscv_orc_b_64(a); } #endif @@ -41,8 +43,8 @@ unsigned long orc_b_64(unsigned long a) { // RV64ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[A:%.*]], i1 false) // RV64ZBB-NEXT: ret i32 [[TMP0]] // -unsigned int clz_32(unsigned int a) { - return __builtin_riscv_clz_32(a); +unsigned int clz_32(uint32_t a) { + return __riscv_clz_32(a); } #if __riscv_xlen == 64 @@ -52,8 +54,8 @@ unsigned int clz_32(unsigned int a) { // RV64ZBB-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 // RV64ZBB-NEXT: ret i32 [[CAST]] // -unsigned int clz_64(unsigned long a) { - return __builtin_riscv_clz_64(a); +unsigned int clz_64(uint64_t a) { + return __riscv_clz_64(a); } #endif @@ -67,8 +69,8 @@ unsigned int clz_64(unsigned long a) { // RV64ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[A:%.*]], i1 false) // RV64ZBB-NEXT: ret i32 [[TMP0]] // -unsigned int ctz_32(unsigned int a) { - return __builtin_riscv_ctz_32(a); +unsigned int ctz_32(uint32_t a) { + return __riscv_ctz_32(a); } #if __riscv_xlen == 64 @@ -78,7 +80,7 @@ unsigned int ctz_32(unsigned int a) { // RV64ZBB-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 // RV64ZBB-NEXT: ret i32 [[CAST]] // -unsigned int ctz_64(unsigned long a) { - return __builtin_riscv_ctz_64(a); +unsigned int ctz_64(uint64_t a) { + return __riscv_ctz_64(a); } #endif From f8f8926054dcf47cb0f3166be8d6961afc979290 Mon Sep 17 00:00:00 2001 From: Owen Pan <owenpiano@gmail.com> Date: Fri, 22 Dec 2023 22:51:00 -0800 Subject: [PATCH 208/342] [clang-format] Fix a bug in annotating function declaration names (#76206) Annotates function declaration names having unnamed parameters. 
--- clang/lib/Format/TokenAnnotator.cpp | 3 ++- clang/unittests/Format/TokenAnnotatorTest.cpp | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index f3551af342439..3ac3aa3c5e3a2 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -3403,7 +3403,8 @@ static bool isFunctionDeclarationName(bool IsCpp, const FormatToken &Current, continue; } if (Tok->is(tok::kw_const) || Tok->isSimpleTypeSpecifier() || - Tok->isOneOf(TT_PointerOrReference, TT_StartOfName, tok::ellipsis)) { + Tok->isOneOf(TT_PointerOrReference, TT_StartOfName, tok::ellipsis, + TT_TypeName)) { return true; } if (Tok->isOneOf(tok::l_brace, TT_ObjCMethodExpr) || Tok->Tok.isLiteral()) diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 8e6935319b2f3..2cafc0438ffb4 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -1718,6 +1718,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsFunctionDeclarationNames) { ASSERT_EQ(Tokens.size(), 14u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::identifier, TT_Unknown); EXPECT_TOKEN(Tokens[4], tok::l_paren, TT_FunctionTypeLParen); + + auto Style = getLLVMStyle(); + Style.TypeNames.push_back("time_t"); + Tokens = annotate("int iso_time(time_t);", Style); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; + EXPECT_TOKEN(Tokens[1], tok::identifier, TT_FunctionDeclarationName); + EXPECT_TOKEN(Tokens[3], tok::identifier, TT_TypeName); } TEST_F(TokenAnnotatorTest, UnderstandsCtorAndDtorDeclNames) { From 8097a5d37b70f483d9e441d78aa7f689618fa795 Mon Sep 17 00:00:00 2001 From: XDeme <66138117+XDeme@users.noreply.github.com> Date: Sat, 23 Dec 2023 04:02:47 -0300 Subject: [PATCH 209/342] [clang-format] Fix operator overload inconsistency in `BreakAfterAttributes: Always` (#74943) Fixes llvm/llvm-project#74901 --- clang/lib/Format/ContinuationIndenter.cpp | 20 +++++++++----------- clang/unittests/Format/FormatTest.cpp | 13 +++++++++++++ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index bd319f21b05f8..8489a30dd34ab 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -583,17 +583,15 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { return true; } - // If the return type spans multiple lines, wrap before the function name. - if (((Current.is(TT_FunctionDeclarationName) && - !State.Line->ReturnTypeWrapped && - // Don't break before a C# function when no break after return type. - (!Style.isCSharp() || - Style.AlwaysBreakAfterReturnType != FormatStyle::RTBS_None) && - // Don't always break between a JavaScript `function` and the function - // name. - !Style.isJavaScript()) || - (Current.is(tok::kw_operator) && Previous.isNot(tok::coloncolon))) && - Previous.isNot(tok::kw_template) && CurrentState.BreakBeforeParameter) { + if (Current.is(TT_FunctionDeclarationName) && + !State.Line->ReturnTypeWrapped && + // Don't break before a C# function when no break after return type. + (!Style.isCSharp() || + Style.AlwaysBreakAfterReturnType != FormatStyle::RTBS_None) && + // Don't always break between a JavaScript `function` and the function + // name. 
+ !Style.isJavaScript() && Previous.isNot(tok::kw_template) && + CurrentState.BreakBeforeParameter) { return true; } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 9772c3be71877..762fc8254bdfc 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -26479,6 +26479,19 @@ TEST_F(FormatTest, BreakAfterAttributes) { "{\n" "}", CtorDtorCode, Style); + + verifyFormat("struct Foo {\n" + " [[maybe_unused]]\n" + " void operator+();\n" + "};\n" + "[[nodiscard]]\n" + "Foo &operator-(Foo &);", + Style); + + Style.ReferenceAlignment = FormatStyle::ReferenceAlignmentStyle::RAS_Left; + verifyFormat("[[nodiscard]]\n" + "Foo& operator-(Foo&);", + Style); } TEST_F(FormatTest, InsertNewlineAtEOF) { From f78a742ab8fc0290742db28a61feef21aa0ecf97 Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Fri, 22 Dec 2023 23:38:01 -0800 Subject: [PATCH 210/342] [NFC][sanitizer] Rename Lock{Before,After}Fork suffixes locking StackDepotBase (#76279) This is preparation for performance optimization. We need to highlight that this is very specific lock, and should not be used for other purposes. Add `fork_child` parameter to distinguish processes after fork. --- compiler-rt/lib/asan/asan_posix.cpp | 48 ++++++++++--------- .../lib/dfsan/dfsan_chained_origin_depot.cpp | 6 +++ .../lib/dfsan/dfsan_chained_origin_depot.h | 3 ++ compiler-rt/lib/dfsan/dfsan_custom.cpp | 12 ++--- compiler-rt/lib/hwasan/hwasan_linux.cpp | 46 ++++++++++-------- compiler-rt/lib/lsan/lsan_posix.cpp | 30 +++++++----- .../lib/msan/msan_chained_origin_depot.cpp | 6 +-- .../lib/msan/msan_chained_origin_depot.h | 4 +- compiler-rt/lib/msan/msan_linux.cpp | 32 +++++++------ .../sanitizer_common/sanitizer_stackdepot.cpp | 4 +- .../sanitizer_common/sanitizer_stackdepot.h | 4 +- 11 files changed, 109 insertions(+), 86 deletions(-) diff --git a/compiler-rt/lib/asan/asan_posix.cpp b/compiler-rt/lib/asan/asan_posix.cpp index a5b87b7fbf1b5..76564538bd5d7 100644 --- a/compiler-rt/lib/asan/asan_posix.cpp +++ b/compiler-rt/lib/asan/asan_posix.cpp @@ -146,33 +146,37 @@ void PlatformTSDDtor(void *tsd) { # endif AsanThread::TSDDtor(tsd); } -#endif +# endif + +static void BeforeFork() { + if (CAN_SANITIZE_LEAKS) { + __lsan::LockGlobal(); + } + // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and lock the + // stuff we need. + __lsan::LockThreads(); + __lsan::LockAllocator(); + StackDepotLockBeforeFork(); +} + +static void AfterFork(bool fork_child) { + StackDepotUnlockAfterFork(fork_child); + // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and unlock + // the stuff we need. + __lsan::UnlockAllocator(); + __lsan::UnlockThreads(); + if (CAN_SANITIZE_LEAKS) { + __lsan::UnlockGlobal(); + } +} void InstallAtForkHandler() { # if SANITIZER_SOLARIS || SANITIZER_NETBSD || SANITIZER_APPLE return; // FIXME: Implement FutexWait. # endif - auto before = []() { - if (CAN_SANITIZE_LEAKS) { - __lsan::LockGlobal(); - } - // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and lock the - // stuff we need. - __lsan::LockThreads(); - __lsan::LockAllocator(); - StackDepotLockAll(); - }; - auto after = []() { - StackDepotUnlockAll(); - // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and unlock - // the stuff we need. 
- __lsan::UnlockAllocator(); - __lsan::UnlockThreads(); - if (CAN_SANITIZE_LEAKS) { - __lsan::UnlockGlobal(); - } - }; - pthread_atfork(before, after, after); + pthread_atfork( + &BeforeFork, []() { AfterFork(/* fork_child= */ false); }, + []() { AfterFork(/* fork_child= */ true); }); } void InstallAtExitCheckLeaks() { diff --git a/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp b/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp index 9ec598bf2ce9e..6644bd6a7c6c0 100644 --- a/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp +++ b/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp @@ -19,4 +19,10 @@ static ChainedOriginDepot chainedOriginDepot; ChainedOriginDepot* GetChainedOriginDepot() { return &chainedOriginDepot; } +void ChainedOriginDepotLockBeforeFork() { chainedOriginDepot.LockAll(); } + +void ChainedOriginDepotUnlockAfterFork(bool fork_child) { + chainedOriginDepot.UnlockAll(); +} + } // namespace __dfsan diff --git a/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.h b/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.h index d715ef707f41b..83b9e29e1b710 100644 --- a/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.h +++ b/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.h @@ -21,6 +21,9 @@ namespace __dfsan { ChainedOriginDepot* GetChainedOriginDepot(); +void ChainedOriginDepotLockBeforeFork(); +void ChainedOriginDepotUnlockAfterFork(bool fork_child); + } // namespace __dfsan #endif // DFSAN_CHAINED_ORIGIN_DEPOT_H diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp index 38371d3533681..05b48fd0525e3 100644 --- a/compiler-rt/lib/dfsan/dfsan_custom.cpp +++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp @@ -2893,13 +2893,13 @@ int __dfso___isoc99_sscanf(char *str, const char *format, dfsan_label str_label, } static void BeforeFork() { - StackDepotLockAll(); - GetChainedOriginDepot()->LockAll(); + StackDepotLockBeforeFork(); + ChainedOriginDepotLockBeforeFork(); } -static void AfterFork() { - GetChainedOriginDepot()->UnlockAll(); - StackDepotUnlockAll(); +static void AfterFork(bool fork_child) { + ChainedOriginDepotUnlockAfterFork(fork_child); + StackDepotUnlockAfterFork(fork_child); } SANITIZER_INTERFACE_ATTRIBUTE @@ -2913,7 +2913,7 @@ SANITIZER_INTERFACE_ATTRIBUTE pid_t __dfso_fork(dfsan_label *ret_label, dfsan_origin *ret_origin) { BeforeFork(); pid_t pid = __dfsw_fork(ret_label); - AfterFork(); + AfterFork(/* fork_child= */ pid == 0); return pid; } diff --git a/compiler-rt/lib/hwasan/hwasan_linux.cpp b/compiler-rt/lib/hwasan/hwasan_linux.cpp index 3271a955e7ed1..e6aa60b324fa7 100644 --- a/compiler-rt/lib/hwasan/hwasan_linux.cpp +++ b/compiler-rt/lib/hwasan/hwasan_linux.cpp @@ -521,28 +521,32 @@ uptr TagMemoryAligned(uptr p, uptr size, tag_t tag) { return AddTagToPointer(p, tag); } +static void BeforeFork() { + if (CAN_SANITIZE_LEAKS) { + __lsan::LockGlobal(); + } + // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and lock the + // stuff we need. + __lsan::LockThreads(); + __lsan::LockAllocator(); + StackDepotLockBeforeFork(); +} + +static void AfterFork(bool fork_child) { + StackDepotUnlockAfterFork(fork_child); + // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and unlock + // the stuff we need. 
+ __lsan::UnlockAllocator(); + __lsan::UnlockThreads(); + if (CAN_SANITIZE_LEAKS) { + __lsan::UnlockGlobal(); + } +} + void HwasanInstallAtForkHandler() { - auto before = []() { - if (CAN_SANITIZE_LEAKS) { - __lsan::LockGlobal(); - } - // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and lock the - // stuff we need. - __lsan::LockThreads(); - __lsan::LockAllocator(); - StackDepotLockAll(); - }; - auto after = []() { - StackDepotUnlockAll(); - // `_lsan` functions defined regardless of `CAN_SANITIZE_LEAKS` and unlock - // the stuff we need. - __lsan::UnlockAllocator(); - __lsan::UnlockThreads(); - if (CAN_SANITIZE_LEAKS) { - __lsan::UnlockGlobal(); - } - }; - pthread_atfork(before, after, after); + pthread_atfork( + &BeforeFork, []() { AfterFork(/* fork_child= */ false); }, + []() { AfterFork(/* fork_child= */ true); }); } void InstallAtExitCheckLeaks() { diff --git a/compiler-rt/lib/lsan/lsan_posix.cpp b/compiler-rt/lib/lsan/lsan_posix.cpp index 4bfadf1ef809c..422c29acca69f 100644 --- a/compiler-rt/lib/lsan/lsan_posix.cpp +++ b/compiler-rt/lib/lsan/lsan_posix.cpp @@ -100,23 +100,27 @@ void InstallAtExitCheckLeaks() { Atexit(DoLeakCheck); } +static void BeforeFork() { + LockGlobal(); + LockThreads(); + LockAllocator(); + StackDepotLockBeforeFork(); +} + +static void AfterFork(bool fork_child) { + StackDepotUnlockAfterFork(fork_child); + UnlockAllocator(); + UnlockThreads(); + UnlockGlobal(); +} + void InstallAtForkHandler() { # if SANITIZER_SOLARIS || SANITIZER_NETBSD || SANITIZER_APPLE return; // FIXME: Implement FutexWait. # endif - auto before = []() { - LockGlobal(); - LockThreads(); - LockAllocator(); - StackDepotLockAll(); - }; - auto after = []() { - StackDepotUnlockAll(); - UnlockAllocator(); - UnlockThreads(); - UnlockGlobal(); - }; - pthread_atfork(before, after, after); + pthread_atfork( + &BeforeFork, []() { AfterFork(/* fork_child= */ false); }, + []() { AfterFork(/* fork_child= */ true); }); } } // namespace __lsan diff --git a/compiler-rt/lib/msan/msan_chained_origin_depot.cpp b/compiler-rt/lib/msan/msan_chained_origin_depot.cpp index 49b14131a89be..c3bd54141e6c3 100644 --- a/compiler-rt/lib/msan/msan_chained_origin_depot.cpp +++ b/compiler-rt/lib/msan/msan_chained_origin_depot.cpp @@ -31,11 +31,9 @@ u32 ChainedOriginDepotGet(u32 id, u32 *other) { return chainedOriginDepot.Get(id, other); } -void ChainedOriginDepotLockAll() { - chainedOriginDepot.LockAll(); -} +void ChainedOriginDepotBeforeFork() { chainedOriginDepot.LockAll(); } -void ChainedOriginDepotUnlockAll() { +void ChainedOriginDepotAfterFork(bool fork_child) { chainedOriginDepot.UnlockAll(); } diff --git a/compiler-rt/lib/msan/msan_chained_origin_depot.h b/compiler-rt/lib/msan/msan_chained_origin_depot.h index ea51c77a905b5..7518745dc8520 100644 --- a/compiler-rt/lib/msan/msan_chained_origin_depot.h +++ b/compiler-rt/lib/msan/msan_chained_origin_depot.h @@ -30,8 +30,8 @@ bool ChainedOriginDepotPut(u32 here_id, u32 prev_id, u32 *new_id); // Retrieves the stored StackDepot ID for the given origin ID. 
u32 ChainedOriginDepotGet(u32 id, u32 *other); -void ChainedOriginDepotLockAll(); -void ChainedOriginDepotUnlockAll(); +void ChainedOriginDepotBeforeFork(); +void ChainedOriginDepotAfterFork(bool fork_child); } // namespace __msan diff --git a/compiler-rt/lib/msan/msan_linux.cpp b/compiler-rt/lib/msan/msan_linux.cpp index 04af6f4b27ac8..c7ecb7cad5666 100644 --- a/compiler-rt/lib/msan/msan_linux.cpp +++ b/compiler-rt/lib/msan/msan_linux.cpp @@ -256,22 +256,26 @@ void MsanTSDDtor(void *tsd) { atomic_signal_fence(memory_order_seq_cst); MsanThread::TSDDtor(tsd); } -#endif +# endif + +static void BeforeFork() { + // Usually we lock ThreadRegistry, but msan does not have one. + LockAllocator(); + StackDepotLockBeforeFork(); + ChainedOriginDepotBeforeFork(); +} + +static void AfterFork(bool fork_child) { + ChainedOriginDepotAfterFork(fork_child); + StackDepotUnlockAfterFork(fork_child); + UnlockAllocator(); + // Usually we unlock ThreadRegistry, but msan does not have one. +} void InstallAtForkHandler() { - auto before = []() { - // Usually we lock ThreadRegistry, but msan does not have one. - LockAllocator(); - StackDepotLockAll(); - ChainedOriginDepotLockAll(); - }; - auto after = []() { - ChainedOriginDepotUnlockAll(); - StackDepotUnlockAll(); - UnlockAllocator(); - // Usually we unlock ThreadRegistry, but msan does not have one. - }; - pthread_atfork(before, after, after); + pthread_atfork( + &BeforeFork, []() { AfterFork(/* fork_child= */ false); }, + []() { AfterFork(/* fork_child= */ true); }); } } // namespace __msan diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp index a746d4621936c..ce21f3c178bce 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp @@ -215,13 +215,13 @@ StackTrace StackDepotGet(u32 id) { return theDepot.Get(id); } -void StackDepotLockAll() { +void StackDepotLockBeforeFork() { theDepot.LockAll(); compress_thread.LockAndStop(); stackStore.LockAll(); } -void StackDepotUnlockAll() { +void StackDepotUnlockAfterFork(bool fork_child) { stackStore.UnlockAll(); compress_thread.Unlock(); theDepot.UnlockAll(); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.h b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.h index cca6fd5346883..82cf7578d0fb9 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.h @@ -39,8 +39,8 @@ StackDepotHandle StackDepotPut_WithHandle(StackTrace stack); // Retrieves a stored stack trace by the id. 
StackTrace StackDepotGet(u32 id); -void StackDepotLockAll(); -void StackDepotUnlockAll(); +void StackDepotLockBeforeFork(); +void StackDepotUnlockAfterFork(bool fork_child); void StackDepotPrintAll(); void StackDepotStopBackgroundThread(); From b203d5320df7754bf0ce019f01347a0ef743a207 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser <nikolasklauser@berlin.de> Date: Sat, 23 Dec 2023 11:21:27 +0100 Subject: [PATCH 211/342] [libc++] Optimize std::find if types are integral and have the same signedness (#70345) Fixes #70238 --- libcxx/include/__algorithm/find.h | 19 +++++++++ .../alg.nonmodifying/alg.find/find.pass.cpp | 42 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h index 754e597130c5b..7d7631b6e98a9 100644 --- a/libcxx/include/__algorithm/find.h +++ b/libcxx/include/__algorithm/find.h @@ -21,8 +21,11 @@ #include <__fwd/bit_reference.h> #include <__iterator/segmented_iterator.h> #include <__string/constexpr_c_functions.h> +#include <__type_traits/is_integral.h> #include <__type_traits/is_same.h> +#include <__type_traits/is_signed.h> #include <__utility/move.h> +#include <limits> #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS # include <cwchar> @@ -76,6 +79,22 @@ __find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj&) { } #endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS +// TODO: This should also be possible to get right with different signedness +// cast integral types to allow vectorization +template <class _Tp, + class _Up, + class _Proj, + __enable_if_t<__is_identity<_Proj>::value && !__libcpp_is_trivially_equality_comparable<_Tp, _Up>::value && + is_integral<_Tp>::value && is_integral<_Up>::value && + is_signed<_Tp>::value == is_signed<_Up>::value, + int> = 0> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* +__find_impl(_Tp* __first, _Tp* __last, const _Up& __value, _Proj& __proj) { + if (__value < numeric_limits<_Tp>::min() || __value > numeric_limits<_Tp>::max()) + return __last; + return std::__find_impl(__first, __last, _Tp(__value), __proj); +} + // __bit_iterator implementation template <bool _ToFind, class _Cp, bool _IsConst> _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, _IsConst> diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp index 0afc573aa1771..0676da13e90f7 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare // ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare // MSVC warning C4389: '==': signed/unsigned mismatch // ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4389 @@ -162,6 +163,45 @@ void test_deque() { } } +template <class T> +struct TestIntegerPromotions1 { + template <class U> + TEST_CONSTEXPR_CXX20 void test(T val, U to_find) { + bool expect_match = val == to_find; + assert(std::find(&val, &val + 1, to_find) == (expect_match ? 
&val : &val + 1)); + } + + template <class U> + TEST_CONSTEXPR_CXX20 void operator()() { + test<U>(0, 0); + test<U>(0, 1); + test<U>(1, 1); + test<U>(0, -1); + test<U>(-1, -1); + test<U>(0, U(-127)); + test<U>(T(-127), U(-127)); + test<U>(T(-128), U(-128)); + test<U>(T(-129), U(-129)); + test<U>(T(255), U(255)); + test<U>(T(256), U(256)); + test<U>(T(257), U(257)); + test<U>(0, std::numeric_limits<U>::min()); + test<U>(T(std::numeric_limits<U>::min()), std::numeric_limits<U>::min()); + test<U>(0, std::numeric_limits<U>::min() + 1); + test<U>(T(std::numeric_limits<U>::min() + 1), std::numeric_limits<U>::min() + 1); + test<U>(0, std::numeric_limits<U>::max()); + test<U>(T(std::numeric_limits<U>::max()), std::numeric_limits<U>::max()); + test<U>(T(std::numeric_limits<U>::max() - 1), std::numeric_limits<U>::max() - 1); + } +}; + +struct TestIntegerPromotions { + template <class T> + TEST_CONSTEXPR_CXX20 void operator()() { + types::for_each(types::integral_types(), TestIntegerPromotions1<T>()); + } +}; + TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::integer_types(), TestTypes<char>()); types::for_each(types::integer_types(), TestTypes<int>()); @@ -181,6 +221,8 @@ TEST_CONSTEXPR_CXX20 bool test() { } #endif + types::for_each(types::integral_types(), TestIntegerPromotions()); + return true; } From 5841140e38c94f3d2b29354f67e70803d88ce174 Mon Sep 17 00:00:00 2001 From: Mark de Wever <koraq@xs4all.nl> Date: Sat, 23 Dec 2023 11:43:17 +0100 Subject: [PATCH 212/342] [libc++][modules] Fixes C++20 build errors. Recent CI changes have disabled testing modules in different configurations. This broke building the std and std.compat module in C++20. This was found by the CI in #76246. --- libcxx/modules/std/algorithm.inc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libcxx/modules/std/algorithm.inc b/libcxx/modules/std/algorithm.inc index f6b35efa144f8..75e8a3af78dea 100644 --- a/libcxx/modules/std/algorithm.inc +++ b/libcxx/modules/std/algorithm.inc @@ -16,7 +16,9 @@ export namespace std { using std::ranges::in_in_result; using std::ranges::in_out_out_result; using std::ranges::in_out_result; +#if _LIBCPP_STD_VER >= 23 using std::ranges::in_value_result; +#endif using std::ranges::min_max_result; // using std::ranges::out_value_result; } // namespace ranges @@ -40,6 +42,7 @@ export namespace std { using std::ranges::none_of; } +#if _LIBCPP_STD_VER >= 23 // [alg.contains], contains namespace ranges { using std::ranges::contains; @@ -47,6 +50,7 @@ export namespace std { using std::ranges::contains_subrange; #endif } // namespace ranges +#endif // _LIBCPP_STD_VER >= 23 // [alg.foreach], for each using std::for_each; From ed6dc6286264f0b6e94bb786a462d9975144ee7d Mon Sep 17 00:00:00 2001 From: Matt Arsenault <Matthew.Arsenault@amd.com> Date: Sat, 23 Dec 2023 20:43:14 +0700 Subject: [PATCH 213/342] DAG: Handle equal size element build_vector promotion (#76213) --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index a483b8028fda9..4e317062cec49 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4908,7 +4908,9 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { static MVT getPromotedVectorElementType(const TargetLowering &TLI, MVT EltVT, MVT NewEltVT) { unsigned OldEltsPerNewElt = EltVT.getSizeInBits() / NewEltVT.getSizeInBits(); - MVT MidVT = 
MVT::getVectorVT(NewEltVT, OldEltsPerNewElt); + MVT MidVT = OldEltsPerNewElt == 1 + ? NewEltVT + : MVT::getVectorVT(NewEltVT, OldEltsPerNewElt); assert(TLI.isTypeLegal(MidVT) && "unexpected"); return MidVT; } @@ -5395,7 +5397,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() && "Invalid promote type for build_vector"); - assert(NewEltVT.bitsLT(EltVT) && "not handled"); + assert(NewEltVT.bitsLE(EltVT) && "not handled"); MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT); @@ -5406,7 +5408,9 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { } SDLoc SL(Node); - SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewOps); + SDValue Concat = + DAG.getNode(MidVT == NewEltVT ? ISD::BUILD_VECTOR : ISD::CONCAT_VECTORS, + SL, NVT, NewOps); SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat); Results.push_back(CvtVec); break; From fbcf8a8cbb2461730bfd0603b396842925a88ef2 Mon Sep 17 00:00:00 2001 From: Florian Hahn <flo@fhahn.com> Date: Sat, 23 Dec 2023 15:53:48 +0100 Subject: [PATCH 214/342] [ConstraintElim] Add (UGE, var, 0) to unsigned system for new vars. (#76262) The constraint system used for ConstraintElimination assumes all varibles to be signed. This can cause missed optimization in the unsigned system, due to missing the information that all variables are unsigned (non-negative). Variables can be marked as non-negative by adding Var >= 0 for all variables. This is done for arguments on ConstraintInfo construction and after adding new variables. This handles cases like the ones outlined in https://discourse.llvm.org/t/why-does-llvm-not-perform-range-analysis-on-integer-values/74341 The original example shared above is now handled without this change, but adding another variable means that instcombine won't be able to simplify examples like https://godbolt.org/z/hTnra7zdY Adding the extra variables comes with a slight compile-time increase https://llvm-compile-time-tracker.com/compare.php?from=7568b36a2bc1a1e496ec29246966ffdfc3a8b87f&to=641a47f0acce7755e340447386013a2e086f03d9&stat=instructions:u stage1-O3 stage1-ReleaseThinLTO stage1-ReleaseLTO-g stage1-O0-g +0.04% +0.07% +0.05% +0.02% stage2-O3 stage2-O0-g stage2-clang +0.05% +0.05% +0.05% https://github.com/llvm/llvm-project/pull/76262 --- .../Scalar/ConstraintElimination.cpp | 25 +++++++++++-- .../ConstraintElimination/add-nuw.ll | 26 +++++--------- .../and-implied-by-operands.ll | 3 +- .../gep-arithmetic-add.ll | 6 ++-- .../ConstraintElimination/gep-arithmetic.ll | 3 +- .../geps-pointers-to-structs.ll | 3 +- .../large-constant-ints.ll | 3 +- .../loops-bottom-tested-pointer-cmps.ll | 6 ++-- .../loops-header-tested-base.ll | 3 +- .../loops-header-tested-pointer-cmps.ll | 10 +++--- .../ConstraintElimination/max-row-limit.ll | 4 +-- .../Transforms/ConstraintElimination/mul.ll | 18 ++++------ .../or-implied-by-operands.ll | 3 +- .../Transforms/ConstraintElimination/or.ll | 19 ++++------ .../reason-about-add-operands.ll | 36 +++++++------------ .../reproducer-remarks-debug.ll | 1 + .../reproducer-remarks.ll | 4 +-- .../Transforms/ConstraintElimination/shl.ll | 33 ++++++----------- .../ConstraintElimination/sub-nuw.ll | 5 ++- .../PhaseOrdering/runtime-check-removal.ll | 17 --------- 20 files changed, 90 insertions(+), 138 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 98cfadddee8ef..899d7e0a11e6f 100644 --- 
a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -273,7 +273,16 @@ class ConstraintInfo { public: ConstraintInfo(const DataLayout &DL, ArrayRef<Value *> FunctionArgs) - : UnsignedCS(FunctionArgs), SignedCS(FunctionArgs), DL(DL) {} + : UnsignedCS(FunctionArgs), SignedCS(FunctionArgs), DL(DL) { + auto &Value2Index = getValue2Index(false); + // Add Arg > -1 constraints to unsigned system for all function arguments. + for (Value *Arg : FunctionArgs) { + ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0), + false, false, false); + VarPos.Coefficients[Value2Index[Arg]] = -1; + UnsignedCS.addVariableRow(VarPos.Coefficients); + } + } DenseMap<Value *, unsigned> &getValue2Index(bool Signed) { return Signed ? SignedCS.getValue2Index() : UnsignedCS.getValue2Index(); @@ -1466,6 +1475,17 @@ void ConstraintInfo::addFact(CmpInst::Predicate Pred, Value *A, Value *B, DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, std::move(ValuesToRelease)); + if (!R.IsSigned) { + for (Value *V : NewVariables) { + ConstraintTy VarPos(SmallVector<int64_t, 8>(Value2Index.size() + 1, 0), + false, false, false); + VarPos.Coefficients[Value2Index[V]] = -1; + CSToUse.addVariableRow(VarPos.Coefficients); + DFSInStack.emplace_back(NumIn, NumOut, R.IsSigned, + SmallVector<Value *, 2>()); + } + } + if (R.isEq()) { // Also add the inverted constraint for equality constraints. for (auto &Coeff : R.Coefficients) @@ -1711,7 +1731,8 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI, #ifndef NDEBUG unsigned SignedEntries = count_if(DFSInStack, [](const StackEntry &E) { return E.IsSigned; }); - assert(Info.getCS(false).size() == DFSInStack.size() - SignedEntries && + assert(Info.getCS(false).size() - FunctionArgs.size() == + DFSInStack.size() - SignedEntries && "updates to CS and DFSInStack are out of sync"); assert(Info.getCS(true).size() == SignedEntries && "updates to CS and DFSInStack are out of sync"); diff --git a/llvm/test/Transforms/ConstraintElimination/add-nuw.ll b/llvm/test/Transforms/ConstraintElimination/add-nuw.ll index be3b66af10e08..a8a474e6d4502 100644 --- a/llvm/test/Transforms/ConstraintElimination/add-nuw.ll +++ b/llvm/test/Transforms/ConstraintElimination/add-nuw.ll @@ -235,8 +235,7 @@ define void @test.decompose.nonconst(i8 %a, i8 %b, i8 %c, i8 %d) { ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw i8 [[A]], [[A]] ; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[ADD_2:%.*]] = add nuw i8 [[A]], [[D:%.*]] -; CHECK-NEXT: [[C_4:%.*]] = icmp uge i8 [[ADD_2]], [[C]] -; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: ret void ; CHECK: if.end: ; CHECK-NEXT: ret void @@ -278,14 +277,11 @@ define void @test.decompose.nonconst.no.null.check(i8 %a, i8 %b, i8 %c, i8 %d) { ; CHECK-NEXT: br i1 [[AND_0]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[ADD_0:%.*]] = add nuw i8 [[A]], [[B]] -; CHECK-NEXT: [[T_0:%.*]] = icmp uge i8 [[ADD_0]], [[C]] -; CHECK-NEXT: call void @use(i1 [[T_0]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw i8 [[A]], [[A]] -; CHECK-NEXT: [[T_1:%.*]] = icmp uge i8 [[ADD_0]], [[C]] -; CHECK-NEXT: call void @use(i1 [[T_1]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[ADD_2:%.*]] = add nuw i8 [[A]], [[D:%.*]] -; CHECK-NEXT: [[C_4:%.*]] = icmp uge i8 [[ADD_2]], [[C]] -; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: ret void ; CHECK: 
if.end: ; CHECK-NEXT: ret void @@ -318,8 +314,7 @@ define i1 @test_n_must_ule_1_due_to_nuw(i8 %n, i8 %i) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SUB_N_1:%.*]] = add nuw i8 [[N:%.*]], -1 ; CHECK-NEXT: [[ADD:%.*]] = add nuw i8 [[I:%.*]], [[SUB_N_1]] -; CHECK-NEXT: [[C_1:%.*]] = icmp uge i8 [[I]], [[ADD]] -; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: ret i1 true ; CHECK: if.end: @@ -376,8 +371,7 @@ define i1 @test_n_must_ule_2_due_to_nuw(i8 %n, i8 %i) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SUB_N_1:%.*]] = add nuw i8 [[N:%.*]], -2 ; CHECK-NEXT: [[ADD:%.*]] = add nuw i8 [[I:%.*]], [[SUB_N_1]] -; CHECK-NEXT: [[C_1:%.*]] = icmp uge i8 [[I]], [[ADD]] -; CHECK-NEXT: br i1 [[C_1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; CHECK: if.then: ; CHECK-NEXT: ret i1 true ; CHECK: if.end: @@ -435,10 +429,9 @@ define i1 @add_nuw_neg_pr54224_i16(i16 %a) { ; CHECK-LABEL: @add_nuw_neg_pr54224_i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NEG2:%.*]] = add nuw i16 [[A:%.*]], -305 -; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i16 0, [[NEG2]] -; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]] +; CHECK-NEXT: br i1 false, label [[EXIT_1:%.*]], label [[EXIT_2:%.*]] ; CHECK: exit.1: -; CHECK-NEXT: ret i1 false +; CHECK-NEXT: ret i1 true ; CHECK: exit.2: ; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i16 [[A]], 0 ; CHECK-NEXT: ret i1 [[C_3]] @@ -464,8 +457,7 @@ define i1 @add_nuw_neg_pr54224_i64(i64 %a) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i64 0, [[NEG2]] ; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]] ; CHECK: exit.1: -; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i64 [[A]], 0 -; CHECK-NEXT: ret i1 [[C_2]] +; CHECK-NEXT: ret i1 true ; CHECK: exit.2: ; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i64 [[A]], 0 ; CHECK-NEXT: ret i1 [[C_3]] diff --git a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll index 5c49ca0e96f30..6bbc73c9c996c 100644 --- a/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll +++ b/llvm/test/Transforms/ConstraintElimination/and-implied-by-operands.ll @@ -181,9 +181,8 @@ define i1 @test_remove_variables(i1 %c, ptr %A, i64 %B, ptr %C) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ult ptr [[TMP0]], [[A:%.*]] ; CHECK-NEXT: br i1 [[C_1]], label [[THEN_2:%.*]], label [[ELSE_2:%.*]] ; CHECK: then.2: -; CHECK-NEXT: [[C_2:%.*]] = icmp ne ptr [[A]], null ; CHECK-NEXT: [[C_3:%.*]] = icmp sgt i64 [[B:%.*]], 0 -; CHECK-NEXT: [[AND:%.*]] = and i1 [[C_2]], [[C_3]] +; CHECK-NEXT: [[AND:%.*]] = and i1 true, [[C_3]] ; CHECK-NEXT: ret i1 [[AND]] ; CHECK: else.2: ; CHECK-NEXT: ret i1 false diff --git a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll index ea65e890e4f3b..52adc78b4e159 100644 --- a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll +++ b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic-add.ll @@ -289,8 +289,7 @@ define i4 @ptr_N_step_zext_n_zext(ptr %src, ptr %lower, ptr %upper, i16 %N, i16 ; CHECK-NEXT: br i1 [[STEP_ULT_N]], label [[PTR_CHECK:%.*]], label [[EXIT:%.*]] ; CHECK: ptr.check: ; CHECK-NEXT: [[SRC_STEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[STEP_ADD_1_EXT]] -; CHECK-NEXT: [[CMP_STEP_START:%.*]] = icmp ult ptr [[SRC_STEP]], [[LOWER]] -; CHECK-NEXT: [[OR_CHECK:%.*]] = 
or i1 [[CMP_STEP_START]], false +; CHECK-NEXT: [[OR_CHECK:%.*]] = or i1 false, false ; CHECK-NEXT: br i1 [[OR_CHECK]], label [[TRAP_BB]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret i4 3 @@ -344,9 +343,8 @@ define i4 @ptr_N_step_zext_n_zext_out_of_bounds(ptr %src, ptr %lower, ptr %upper ; CHECK-NEXT: br i1 [[STEP_ULT_N]], label [[PTR_CHECK:%.*]], label [[EXIT:%.*]] ; CHECK: ptr.check: ; CHECK-NEXT: [[SRC_STEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i32 [[STEP_ADD_2_EXT]] -; CHECK-NEXT: [[CMP_STEP_START:%.*]] = icmp ult ptr [[SRC_STEP]], [[LOWER]] ; CHECK-NEXT: [[CMP_STEP_END:%.*]] = icmp uge ptr [[SRC_STEP]], [[UPPER]] -; CHECK-NEXT: [[OR_CHECK:%.*]] = or i1 [[CMP_STEP_START]], [[CMP_STEP_END]] +; CHECK-NEXT: [[OR_CHECK:%.*]] = or i1 false, [[CMP_STEP_END]] ; CHECK-NEXT: br i1 [[OR_CHECK]], label [[TRAP_BB]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret i4 3 diff --git a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll index 0a835fb38a243..a4d825b327969 100644 --- a/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll +++ b/llvm/test/Transforms/ConstraintElimination/gep-arithmetic.ll @@ -530,9 +530,8 @@ define i4 @ptr_N_unsigned_positive(ptr %src, ptr %lower, ptr %upper, i16 %N, i16 ; CHECK-NEXT: [[SRC_END:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i16 [[N:%.*]] ; CHECK-NEXT: [[CMP_SRC_START:%.*]] = icmp ult ptr [[SRC]], [[LOWER:%.*]] ; CHECK-NEXT: [[CMP_SRC_END:%.*]] = icmp uge ptr [[SRC_END]], [[UPPER:%.*]] -; CHECK-NEXT: [[N_NEG:%.*]] = icmp ult i16 [[N]], 0 ; CHECK-NEXT: [[OR_PRECOND_0:%.*]] = or i1 [[CMP_SRC_START]], [[CMP_SRC_END]] -; CHECK-NEXT: [[OR_PRECOND_1:%.*]] = or i1 [[OR_PRECOND_0]], [[N_NEG]] +; CHECK-NEXT: [[OR_PRECOND_1:%.*]] = or i1 [[OR_PRECOND_0]], false ; CHECK-NEXT: br i1 [[OR_PRECOND_1]], label [[TRAP_BB:%.*]], label [[STEP_CHECK:%.*]] ; CHECK: trap.bb: ; CHECK-NEXT: ret i4 2 diff --git a/llvm/test/Transforms/ConstraintElimination/geps-pointers-to-structs.ll b/llvm/test/Transforms/ConstraintElimination/geps-pointers-to-structs.ll index e72f1ee9b3706..a86cd5c04d5b9 100644 --- a/llvm/test/Transforms/ConstraintElimination/geps-pointers-to-structs.ll +++ b/llvm/test/Transforms/ConstraintElimination/geps-pointers-to-structs.ll @@ -369,8 +369,7 @@ define i1 @ptr.int.struct.test.ult.due.to.second.dimension.var.index(ptr %start, ; CHECK-NEXT: [[IDX_EXT:%.*]] = zext i32 [[IDX]] to i64 ; CHECK-NEXT: [[START_0:%.*]] = getelementptr inbounds [[STRUCT_2]], ptr [[START]], i64 6, i32 0 ; CHECK-NEXT: [[START_0_CAST:%.*]] = bitcast ptr [[START_0]] to ptr -; CHECK-NEXT: [[C_0:%.*]] = icmp ult ptr [[START_0_CAST]], [[HIGH]] -; CHECK-NEXT: ret i1 [[C_0]] +; CHECK-NEXT: ret i1 true ; CHECK: if.end: ; CHECK-NEXT: ret i1 true ; diff --git a/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll b/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll index 9568b155af13a..a80e492e08246 100644 --- a/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll +++ b/llvm/test/Transforms/ConstraintElimination/large-constant-ints.ll @@ -128,8 +128,7 @@ define i1 @gep_decomp_i80(ptr %a) { ; CHECK-LABEL: @gep_decomp_i80( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i80 1973801615886922022913 -; CHECK-NEXT: [[C:%.*]] = icmp eq ptr [[GEP]], null -; CHECK-NEXT: br i1 [[C]], label [[THEN:%.*]], label [[ELSE:%.*]] +; CHECK-NEXT: br i1 false, label [[THEN:%.*]], label [[ELSE:%.*]] ; CHECK: then: ; CHECK-NEXT: [[GEP_1:%.*]] = 
getelementptr inbounds i8, ptr [[A]], i80 1973801615886922022913 ; CHECK-NEXT: ret i1 true diff --git a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll index 17a54b6ecbe2e..e3f2a54f321ed 100644 --- a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll +++ b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll @@ -23,7 +23,7 @@ define void @checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 %n) ; CHECK-NEXT: [[PTR_IV:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV]] ; CHECK-NEXT: [[CMP_PTR_IV_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV]] ; CHECK-NEXT: [[CMP_PTR_IV_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV]] -; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP_PTR_IV_LOWER]], [[CMP_PTR_IV_UPPER]] +; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[CMP_PTR_IV_UPPER]] ; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: store i8 0, ptr [[PTR_IV]], align 4 @@ -88,7 +88,7 @@ define void @some_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 ; CHECK-NEXT: [[PTR_IV:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV]] ; CHECK-NEXT: [[CMP_PTR_IV_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV]] ; CHECK-NEXT: [[CMP_PTR_IV_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV]] -; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP_PTR_IV_LOWER]], [[CMP_PTR_IV_UPPER]] +; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[CMP_PTR_IV_UPPER]] ; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_BODY:%.*]] ; CHECK: loop.body: ; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1 @@ -166,7 +166,7 @@ define void @no_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 % ; CHECK-NEXT: [[PTR_IV:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV]] ; CHECK-NEXT: [[CMP_PTR_IV_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV]] ; CHECK-NEXT: [[CMP_PTR_IV_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV]] -; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP_PTR_IV_LOWER]], [[CMP_PTR_IV_UPPER]] +; CHECK-NEXT: [[OR:%.*]] = or i1 false, [[CMP_PTR_IV_UPPER]] ; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_BODY:%.*]] ; CHECK: loop.body: ; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1 diff --git a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-base.ll b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-base.ll index 8245e84108e42..7b928a030614b 100644 --- a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-base.ll +++ b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-base.ll @@ -20,8 +20,7 @@ define void @loop_phi_pos_start_value(i32 %y, i1 %c, i32 %n) { ; CHECK-NEXT: call void @use(i1 [[C_2]]) ; CHECK-NEXT: [[C_3:%.*]] = icmp sgt i32 [[X]], 9 ; CHECK-NEXT: call void @use(i1 [[C_3]]) -; CHECK-NEXT: [[C_4:%.*]] = icmp sge i32 [[X]], 0 -; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[C_5:%.*]] = icmp sge i32 [[X]], 9 ; CHECK-NEXT: call void @use(i1 [[C_5]]) ; CHECK-NEXT: [[X_NEXT]] = add nsw i32 [[X]], 1 diff --git a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll index b8ae10f42f036..66ce1ffc6ebc9 100644 --- a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll +++ b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll @@ -19,9 +19,8 @@ 
define void @test1(ptr %src, ptr noundef %lower, ptr noundef %upper, i8 %N) { ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_BODY:%.*]] ; CHECK: loop.body: ; CHECK-NEXT: [[SRC_IV:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[IV]] -; CHECK-NEXT: [[CMP_IV_START:%.*]] = icmp ult ptr [[SRC_IV]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_END:%.*]] = icmp uge ptr [[SRC_IV]], [[UPPER]] -; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_IV_START]], [[CMP_IV_END]] +; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_IV_END]] ; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]] ; CHECK: loop.body.1: ; CHECK-NEXT: [[PTR_SRC_IV:%.*]] = bitcast ptr [[SRC_IV]] to ptr @@ -120,9 +119,8 @@ define void @test2(ptr %src, ptr %lower, ptr %upper, i8 %N) { ; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP_BODY:%.*]] ; CHECK: loop.body: ; CHECK-NEXT: [[SRC_IV:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[IV]] -; CHECK-NEXT: [[CMP_IV_START:%.*]] = icmp ult ptr [[SRC_IV]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_END:%.*]] = icmp uge ptr [[SRC_IV]], [[UPPER]] -; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_IV_START]], [[CMP_IV_END]] +; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_IV_END]] ; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]] ; CHECK: loop.body.1: ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1 @@ -218,7 +216,7 @@ define void @test2_with_ne(ptr %src, ptr %lower, ptr %upper, i8 %N) { ; CHECK-NEXT: [[SRC_IV:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[IV]] ; CHECK-NEXT: [[CMP_IV_START:%.*]] = icmp ult ptr [[SRC_IV]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_END:%.*]] = icmp uge ptr [[SRC_IV]], [[UPPER]] -; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_IV_START]], [[CMP_IV_END]] +; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_IV_END]] ; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]] ; CHECK: loop.body.1: ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1 @@ -314,7 +312,7 @@ define void @test3(ptr %src, ptr %lower, ptr %upper, i8 %N) { ; CHECK-NEXT: [[SRC_IV:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[IV]] ; CHECK-NEXT: [[CMP_IV_START:%.*]] = icmp ult ptr [[SRC_IV]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_END:%.*]] = icmp uge ptr [[SRC_IV]], [[UPPER]] -; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_IV_START]], [[CMP_IV_END]] +; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_IV_END]] ; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]] ; CHECK: loop.body.1: ; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[NEXT]] diff --git a/llvm/test/Transforms/ConstraintElimination/max-row-limit.ll b/llvm/test/Transforms/ConstraintElimination/max-row-limit.ll index 752e012791ee4..0e078109ed663 100644 --- a/llvm/test/Transforms/ConstraintElimination/max-row-limit.ll +++ b/llvm/test/Transforms/ConstraintElimination/max-row-limit.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=constraint-elimination -S %s | FileCheck --check-prefixes=COMMON,SIMP %s -; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=4 -S %s | FileCheck --check-prefixes=COMMON,SIMP %s -; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=3 -S %s | FileCheck --check-prefixes=COMMON,NOSIMP %s +; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=9 -S %s | FileCheck --check-prefixes=COMMON,SIMP %s +; RUN: opt -passes=constraint-elimination -constraint-elimination-max-rows=8 -S %s | FileCheck 
--check-prefixes=COMMON,NOSIMP %s define i1 @test_max_row_limit(i32 %l0, i32 %l1, i32 %l2, i32 %l3, i32 %l4) { diff --git a/llvm/test/Transforms/ConstraintElimination/mul.ll b/llvm/test/Transforms/ConstraintElimination/mul.ll index c8a1d31a48d56..362cd33fe6505 100644 --- a/llvm/test/Transforms/ConstraintElimination/mul.ll +++ b/llvm/test/Transforms/ConstraintElimination/mul.ll @@ -9,8 +9,7 @@ define i1 @test_mul_const_nuw_unsigned_1(i8 %start, i8 %high) { ; CHECK-NEXT: [[START_MUL_4:%.*]] = mul nuw i8 [[START:%.*]], 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_MUL_4]], [[HIGH:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) -; CHECK-NEXT: [[T_1:%.*]] = icmp ult i8 [[START]], [[HIGH]] -; CHECK-NEXT: ret i1 [[T_1]] +; CHECK-NEXT: ret i1 true ; entry: %start.mul.4 = mul nuw i8 %start, 4 @@ -28,8 +27,7 @@ define i1 @test_mul_const_nuw_unsigned_2(i8 %start, i8 %high) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_MUL_4]], [[HIGH:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) ; CHECK-NEXT: [[START_MUL_2:%.*]] = mul nuw i8 [[START]], 2 -; CHECK-NEXT: [[T:%.*]] = icmp ult i8 [[START_MUL_2]], [[HIGH]] -; CHECK-NEXT: ret i1 [[T]] +; CHECK-NEXT: ret i1 true ; entry: %start.mul.4 = mul nuw i8 %start, 4 @@ -163,8 +161,7 @@ define i1 @test_mul_const_nuw_unsigned_8(i8 %start, i8 %high) { ; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]] ; CHECK-NEXT: [[START_ADD_2_1:%.*]] = add nuw i8 [[START_ADD_2]], 1 ; CHECK-NEXT: [[START_MUL_3:%.*]] = mul nuw i8 [[START]], 3 -; CHECK-NEXT: [[T_5:%.*]] = icmp ule i8 [[START_ADD_1]], [[START_MUL_3]] -; CHECK-NEXT: ret i1 [[T_5]] +; CHECK-NEXT: ret i1 true ; entry: %start.mul.4 = mul nuw i8 %start, 4 @@ -191,8 +188,7 @@ define i1 @test_mul_const_nuw_unsigned_9(i8 %start, i8 %high) { ; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]] ; CHECK-NEXT: [[START_ADD_2_1:%.*]] = add nuw i8 [[START_ADD_2]], 1 ; CHECK-NEXT: [[START_MUL_3:%.*]] = mul nuw i8 [[START]], 3 -; CHECK-NEXT: [[F_5:%.*]] = icmp ult i8 [[START_ADD_2]], [[START_MUL_3]] -; CHECK-NEXT: ret i1 [[F_5]] +; CHECK-NEXT: ret i1 false ; entry: %start.mul.4 = mul nuw i8 %start, 4 @@ -371,8 +367,7 @@ define i1 @test_mul_add_const_nuw_unsigned_1(i8 %start, i8 %high) { ; CHECK-NEXT: [[START_MUL_4:%.*]] = mul nuw i8 [[ADD]], 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_MUL_4]], [[HIGH:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) -; CHECK-NEXT: [[T_1:%.*]] = icmp ult i8 [[START]], [[HIGH]] -; CHECK-NEXT: ret i1 [[T_1]] +; CHECK-NEXT: ret i1 true ; entry: %add = add nuw i8 %start, 3 @@ -392,8 +387,7 @@ define i1 @test_mul_add_const_nuw_unsigned_2(i8 %start, i8 %high) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_MUL_4]], [[HIGH:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) ; CHECK-NEXT: [[START_MUL_2:%.*]] = mul nuw i8 [[START]], 2 -; CHECK-NEXT: [[T_2:%.*]] = icmp ult i8 [[START_MUL_2]], [[HIGH]] -; CHECK-NEXT: ret i1 [[T_2]] +; CHECK-NEXT: ret i1 true ; entry: %add = add nuw i8 %start, 3 diff --git a/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll b/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll index fea6f2d8a5dc4..f5c108822b8cd 100644 --- a/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll +++ b/llvm/test/Transforms/ConstraintElimination/or-implied-by-operands.ll @@ -181,9 +181,8 @@ define i1 @test_remove_variables(i1 %c, ptr %A, i64 %B, ptr %C) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ult ptr [[TMP0]], [[A:%.*]] ; CHECK-NEXT: br i1 [[C_1]], label 
[[THEN_2:%.*]], label [[ELSE_2:%.*]] ; CHECK: then.2: -; CHECK-NEXT: [[C_2:%.*]] = icmp ne ptr [[A]], null ; CHECK-NEXT: [[C_3:%.*]] = icmp sgt i64 [[B:%.*]], 0 -; CHECK-NEXT: [[OR:%.*]] = or i1 [[C_2]], [[C_3]] +; CHECK-NEXT: [[OR:%.*]] = or i1 true, [[C_3]] ; CHECK-NEXT: ret i1 [[OR]] ; CHECK: else.2: ; CHECK-NEXT: ret i1 false diff --git a/llvm/test/Transforms/ConstraintElimination/or.ll b/llvm/test/Transforms/ConstraintElimination/or.ll index 2f24519ddd1d5..01b8ca973efa5 100644 --- a/llvm/test/Transforms/ConstraintElimination/or.ll +++ b/llvm/test/Transforms/ConstraintElimination/or.ll @@ -135,15 +135,13 @@ define i1 @test_or_chain_ule_1(i4 %x, i4 %y, i4 %z, i4 %a, i4 %b) { ; CHECK-NEXT: [[RES_2:%.*]] = xor i1 [[RES_1]], [[C_7]] ; CHECK-NEXT: ret i1 [[RES_2]] ; CHECK: exit: -; CHECK-NEXT: [[RES_3:%.*]] = xor i1 false, false +; CHECK-NEXT: [[RES_3:%.*]] = xor i1 true, true ; CHECK-NEXT: [[RES_4:%.*]] = xor i1 [[RES_3]], true ; CHECK-NEXT: [[RES_5:%.*]] = xor i1 [[RES_4]], true ; CHECK-NEXT: [[RES_6:%.*]] = xor i1 [[RES_5]], true ; CHECK-NEXT: [[RES_7:%.*]] = xor i1 [[RES_6]], true -; CHECK-NEXT: [[C_8:%.*]] = icmp ule i4 [[X]], [[A]] -; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], [[C_8]] -; CHECK-NEXT: [[C_9:%.*]] = icmp ule i4 [[X]], [[B:%.*]] -; CHECK-NEXT: [[RES_9:%.*]] = xor i1 [[RES_8]], [[C_9]] +; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], true +; CHECK-NEXT: [[RES_9:%.*]] = xor i1 [[RES_8]], true ; CHECK-NEXT: ret i1 [[RES_9]] ; entry: @@ -210,15 +208,13 @@ define i1 @test_or_chain_ule_2(i4 %x, i4 %y, i4 %z, i4 %a, i4 %b) { ; CHECK-NEXT: [[RES_2:%.*]] = xor i1 [[RES_1]], [[C_7]] ; CHECK-NEXT: ret i1 [[RES_2]] ; CHECK: exit: -; CHECK-NEXT: [[RES_3:%.*]] = xor i1 false, false +; CHECK-NEXT: [[RES_3:%.*]] = xor i1 true, true ; CHECK-NEXT: [[RES_4:%.*]] = xor i1 [[RES_3]], true ; CHECK-NEXT: [[RES_5:%.*]] = xor i1 [[RES_4]], true ; CHECK-NEXT: [[RES_6:%.*]] = xor i1 [[RES_5]], true ; CHECK-NEXT: [[RES_7:%.*]] = xor i1 [[RES_6]], true -; CHECK-NEXT: [[C_8:%.*]] = icmp ule i4 [[X]], [[A]] -; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], [[C_8]] -; CHECK-NEXT: [[C_9:%.*]] = icmp ule i4 [[X]], [[B:%.*]] -; CHECK-NEXT: [[RES_9:%.*]] = xor i1 [[RES_8]], [[C_9]] +; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], true +; CHECK-NEXT: [[RES_9:%.*]] = xor i1 [[RES_8]], true ; CHECK-NEXT: ret i1 [[RES_9]] ; entry: @@ -354,8 +350,7 @@ define i1 @test_or_chain_with_and_ule(i4 %x, i4 %y, i4 %z, i4 %a, i4 %b) { ; CHECK-NEXT: [[RES_6:%.*]] = xor i1 [[RES_5]], [[C_8]] ; CHECK-NEXT: [[C_9:%.*]] = icmp ule i4 [[X]], [[B:%.*]] ; CHECK-NEXT: [[RES_7:%.*]] = xor i1 [[RES_6]], [[C_9]] -; CHECK-NEXT: [[C_10:%.*]] = icmp ule i4 2, [[X]] -; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], [[C_10]] +; CHECK-NEXT: [[RES_8:%.*]] = xor i1 [[RES_7]], true ; CHECK-NEXT: [[C_11:%.*]] = icmp ugt i4 2, [[A]] ; CHECK-NEXT: [[RES_9:%.*]] = xor i1 [[RES_8]], [[C_11]] ; CHECK-NEXT: ret i1 [[RES_9]] diff --git a/llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll b/llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll index 3c95b192705a5..fb13417c7a6c7 100644 --- a/llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll +++ b/llvm/test/Transforms/ConstraintElimination/reason-about-add-operands.ll @@ -14,8 +14,7 @@ define i1 @addition_with_extra_facts_and_args_ult_i64(i64 noundef %a, i64 nounde ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[B]], [[A]] ; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i64 [[ADD]], [[C]] ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]]) -; 
CHECK-NEXT: [[T:%.*]] = icmp ult i64 [[A]], [[C]] -; CHECK-NEXT: ret i1 [[T]] +; CHECK-NEXT: ret i1 true ; entry: %cmp.a = icmp ule i64 %a, 2048 @@ -40,8 +39,7 @@ define i1 @addition_with_extra_facts_and_args_ult_1(i16 noundef %a, i16 noundef ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]] ; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]] ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]]) -; CHECK-NEXT: [[T:%.*]] = icmp ult i16 [[A]], [[C]] -; CHECK-NEXT: ret i1 [[T]] +; CHECK-NEXT: ret i1 true ; entry: %cmp.a = icmp ule i16 %a, 2048 @@ -66,8 +64,7 @@ define i1 @addition_with_extra_facts_and_args_ult_2(i16 noundef %a, i16 noundef ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]] ; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]] ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]]) -; CHECK-NEXT: [[T:%.*]] = icmp ult i16 [[B]], [[C]] -; CHECK-NEXT: ret i1 [[T]] +; CHECK-NEXT: ret i1 true ; entry: %cmp.a = icmp ule i16 %a, 2048 @@ -92,8 +89,7 @@ define i1 @addition_with_extra_facts_and_args_ult_3(i16 noundef %a, i16 noundef ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]] ; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]] ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]]) -; CHECK-NEXT: [[F:%.*]] = icmp uge i16 [[A]], [[C]] -; CHECK-NEXT: ret i1 [[F]] +; CHECK-NEXT: ret i1 false ; entry: %cmp.a = icmp ule i16 %a, 2048 @@ -118,8 +114,7 @@ define i1 @addition_with_extra_facts_and_args_ult_4(i16 noundef %a, i16 noundef ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]] ; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]] ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]]) -; CHECK-NEXT: [[F:%.*]] = icmp uge i16 [[B]], [[C]] -; CHECK-NEXT: ret i1 [[F]] +; CHECK-NEXT: ret i1 false ; entry: %cmp.a = icmp ule i16 %a, 2048 @@ -201,8 +196,7 @@ define i1 @addition_with_extra_facts_and_return_value_ult_1() { ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]] ; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]] ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]]) -; CHECK-NEXT: [[T:%.*]] = icmp ult i16 [[A]], [[C]] -; CHECK-NEXT: ret i1 [[T]] +; CHECK-NEXT: ret i1 true ; entry: %a = call i16 @get() @@ -232,8 +226,7 @@ define i1 @addition_with_extra_facts_and_return_value_ult_2() { ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]] ; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]] ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]]) -; CHECK-NEXT: [[F:%.*]] = icmp uge i16 [[A]], [[C]] -; CHECK-NEXT: ret i1 [[F]] +; CHECK-NEXT: ret i1 false ; entry: %a = call i16 @get() @@ -259,8 +252,7 @@ define i1 @addition_no_extra_facts_with_return_value_ult_1() { ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]] ; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]] ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]]) -; CHECK-NEXT: [[T:%.*]] = icmp ult i16 [[A]], [[C]] -; CHECK-NEXT: ret i1 [[T]] +; CHECK-NEXT: ret i1 true ; entry: %a = call i16 @get() @@ -282,8 +274,7 @@ define i1 @addition_no_extra_facts_with_return_value_ult_2() { ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[B]], [[A]] ; CHECK-NEXT: [[CMP_ADD:%.*]] = icmp ult i16 [[ADD]], [[C]] ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_ADD]]) -; CHECK-NEXT: [[F:%.*]] = icmp uge i16 [[A]], [[C]] -; CHECK-NEXT: ret i1 [[F]] +; CHECK-NEXT: ret i1 false ; entry: %a = call i16 @get() @@ -326,8 +317,7 @@ define i1 @assume_x_ugt_y_plus_y_via_shl_eq(i8 %x, i8 %y) { ; CHECK-NEXT: [[S:%.*]] = shl nuw i8 [[Y]], 1 ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X]], 
[[S]] ; CHECK-NEXT: tail call void @llvm.assume(i1 [[C_1]]) -; CHECK-NEXT: [[C_2:%.*]] = icmp eq i8 [[X]], [[Y]] -; CHECK-NEXT: ret i1 [[C_2]] +; CHECK-NEXT: ret i1 false ; %s = shl nuw i8 %y, 1 %c.1 = icmp ugt i8 %x, %s @@ -358,8 +348,7 @@ define i1 @assume_x_ugt_y_plus_y_via_add_eq(i8 %x, i8 %y) { ; CHECK-NEXT: [[S:%.*]] = add nuw i8 [[Y]], [[Y]] ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X]], [[S]] ; CHECK-NEXT: tail call void @llvm.assume(i1 [[C_1]]) -; CHECK-NEXT: [[C_2:%.*]] = icmp eq i8 [[X]], [[Y]] -; CHECK-NEXT: ret i1 [[C_2]] +; CHECK-NEXT: ret i1 false ; %s = add nuw i8 %y, %y %c.1 = icmp ugt i8 %x, %s @@ -390,8 +379,7 @@ define i1 @assume_x_ugt_y_plus_y_via_shl_ne(i8 %x, i8 %y) { ; CHECK-NEXT: [[S:%.*]] = shl nuw i8 [[Y]], 1 ; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i8 [[X]], [[S]] ; CHECK-NEXT: tail call void @llvm.assume(i1 [[C_1]]) -; CHECK-NEXT: [[C_2:%.*]] = icmp ne i8 [[X]], [[Y]] -; CHECK-NEXT: ret i1 [[C_2]] +; CHECK-NEXT: ret i1 true ; %s = shl nuw i8 %y, 1 %c.1 = icmp ugt i8 %x, %s diff --git a/llvm/test/Transforms/ConstraintElimination/reproducer-remarks-debug.ll b/llvm/test/Transforms/ConstraintElimination/reproducer-remarks-debug.ll index 4fdc8e583112e..f0b20fb360597 100644 --- a/llvm/test/Transforms/ConstraintElimination/reproducer-remarks-debug.ll +++ b/llvm/test/Transforms/ConstraintElimination/reproducer-remarks-debug.ll @@ -5,6 +5,7 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" ; CHECK: Condition icmp eq ptr %a, null implied by dominating constraints +; CHECK-NEXT: -1 * %a <= 0 ; CHECK-NEXT: %a <= 0 ; CHECK-NEXT: Creating reproducer for %c.2 = icmp eq ptr %a, null ; CHECK-NEXT: found external input ptr %a diff --git a/llvm/test/Transforms/ConstraintElimination/reproducer-remarks.ll b/llvm/test/Transforms/ConstraintElimination/reproducer-remarks.ll index 9f8ae3825a8d8..f912abfc24a8f 100644 --- a/llvm/test/Transforms/ConstraintElimination/reproducer-remarks.ll +++ b/llvm/test/Transforms/ConstraintElimination/reproducer-remarks.ll @@ -183,14 +183,14 @@ else: define i32 @test_branch(i32 %a) { ; CHECK-LABEL: define i1 @"{{.+}}test_branchrepro"(i32 %a) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %0 = icmp ult i32 %a, 0 +; CHECK-NEXT: %0 = icmp ult i32 %a, 4 ; CHECK-NEXT: call void @llvm.assume(i1 %0) ; CHECK-NEXT: %c.2 = icmp ugt i32 0, 0 ; CHECK-NEXT: ret i1 %c.2 ; CHECK-NEXT: } ; entry: - %c.1 = icmp ult i32 %a, 0 + %c.1 = icmp ult i32 %a, 4 br i1 %c.1, label %then, label %exit then: diff --git a/llvm/test/Transforms/ConstraintElimination/shl.ll b/llvm/test/Transforms/ConstraintElimination/shl.ll index 9f98a9d3a57ca..9fe8c147017b0 100644 --- a/llvm/test/Transforms/ConstraintElimination/shl.ll +++ b/llvm/test/Transforms/ConstraintElimination/shl.ll @@ -9,8 +9,7 @@ define i1 @test_shl_const_nuw_unsigned_1(i8 %start, i8 %high) { ; CHECK-NEXT: [[START_SHL_4:%.*]] = shl nuw i8 [[START:%.*]], 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_SHL_4]], [[HIGH:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) -; CHECK-NEXT: [[T_1:%.*]] = icmp ult i8 [[START]], [[HIGH]] -; CHECK-NEXT: ret i1 [[T_1]] +; CHECK-NEXT: ret i1 true ; entry: %start.shl.4 = shl nuw i8 %start, 4 @@ -28,8 +27,7 @@ define i1 @test_shl_const_nuw_unsigned_2(i8 %start, i8 %high) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_SHL_4]], [[HIGH:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) ; CHECK-NEXT: [[START_SHL_2:%.*]] = shl nuw i8 [[START]], 2 -; CHECK-NEXT: [[T:%.*]] = icmp ult i8 [[START_SHL_2]], [[HIGH]] -; CHECK-NEXT: ret i1 [[T]] +; CHECK-NEXT: 
ret i1 true ; entry: %start.shl.4 = shl nuw i8 %start, 4 @@ -49,8 +47,7 @@ define i1 @test_shl_const_nuw_unsigned_3(i8 %start, i8 %high) { ; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) ; CHECK-NEXT: [[START_SHL_2:%.*]] = shl nuw i8 [[START]], 2 ; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]] -; CHECK-NEXT: [[T:%.*]] = icmp ule i8 [[START_ADD_1]], [[START_SHL_2]] -; CHECK-NEXT: ret i1 [[T]] +; CHECK-NEXT: ret i1 true ; entry: %start.shl.4 = shl nuw i8 %start, 4 @@ -94,8 +91,7 @@ define i1 @test_shl_const_nuw_unsigned_5(i8 %start, i8 %high) { ; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) ; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]] ; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]] -; CHECK-NEXT: [[T_4:%.*]] = icmp ule i8 [[START_ADD_2]], [[START_SHL_4]] -; CHECK-NEXT: ret i1 [[T_4]] +; CHECK-NEXT: ret i1 true ; entry: %start.shl.4 = shl nuw i8 %start, 4 @@ -167,8 +163,7 @@ define i1 @test_shl_const_nuw_unsigned_8(i8 %start, i8 %high) { ; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]] ; CHECK-NEXT: [[START_ADD_2_1:%.*]] = add nuw i8 [[START_ADD_2]], 1 ; CHECK-NEXT: [[START_SHL_3:%.*]] = shl nuw i8 [[START]], 3 -; CHECK-NEXT: [[T_5:%.*]] = icmp ule i8 [[START_ADD_1]], [[START_SHL_3]] -; CHECK-NEXT: ret i1 [[T_5]] +; CHECK-NEXT: ret i1 true ; entry: %start.shl.4 = shl nuw i8 %start, 4 @@ -296,8 +291,7 @@ define i1 @test_shl_add_const_nuw_unsigned_1(i8 %start, i8 %high) { ; CHECK-NEXT: [[START_SHL_4:%.*]] = shl nuw i8 [[ADD]], 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_SHL_4]], [[HIGH:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) -; CHECK-NEXT: [[T_1:%.*]] = icmp ult i8 [[START]], [[HIGH]] -; CHECK-NEXT: ret i1 [[T_1]] +; CHECK-NEXT: ret i1 true ; entry: %add = add nuw i8 %start, 3 @@ -317,8 +311,7 @@ define i1 @test_shl_add_const_nuw_unsigned_2(i8 %start, i8 %high) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ult i8 [[START_SHL_4]], [[HIGH:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) ; CHECK-NEXT: [[START_SHL_2:%.*]] = shl nuw i8 [[START]], 2 -; CHECK-NEXT: [[T_2:%.*]] = icmp ult i8 [[START_SHL_2]], [[HIGH]] -; CHECK-NEXT: ret i1 [[T_2]] +; CHECK-NEXT: ret i1 true ; entry: %add = add nuw i8 %start, 3 @@ -340,8 +333,7 @@ define i1 @test_shl_add_const_nuw_unsigned_3(i8 %start, i8 %high) { ; CHECK-NEXT: call void @llvm.assume(i1 [[C_1]]) ; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]] ; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]] -; CHECK-NEXT: [[T_3:%.*]] = icmp ule i8 [[START_ADD_2]], [[START_SHL_4]] -; CHECK-NEXT: ret i1 [[T_3]] +; CHECK-NEXT: ret i1 true ; entry: %add = add nuw i8 %start, 3 @@ -365,8 +357,7 @@ define i1 @test_shl_add_const_nuw_unsigned_4(i8 %start, i8 %high) { ; CHECK-NEXT: [[START_SHL_2:%.*]] = shl nuw i8 [[START]], 2 ; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]] ; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]] -; CHECK-NEXT: [[T_4:%.*]] = icmp ult i8 [[START_ADD_2]], [[START_SHL_4]] -; CHECK-NEXT: ret i1 [[T_4]] +; CHECK-NEXT: ret i1 true ; entry: %add = add nuw i8 %start, 3 @@ -391,8 +382,7 @@ define i1 @test_shl_add_const_nuw_unsigned_5(i8 %start, i8 %high) { ; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]] ; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]] ; CHECK-NEXT: [[START_ADD_2_12:%.*]] = add nuw i8 [[START_ADD_2]], 12 -; CHECK-NEXT: [[T_5:%.*]] = icmp ule i8 [[START_ADD_2_12]], 
[[START_SHL_4]] -; CHECK-NEXT: ret i1 [[T_5]] +; CHECK-NEXT: ret i1 true ; entry: %add = add nuw i8 %start, 3 @@ -417,8 +407,7 @@ define i1 @test_shl_add_const_nuw_unsigned_6(i8 %start, i8 %high) { ; CHECK-NEXT: [[START_ADD_1:%.*]] = add nuw i8 [[START]], [[START]] ; CHECK-NEXT: [[START_ADD_2:%.*]] = add nuw i8 [[START_ADD_1]], [[START_ADD_1]] ; CHECK-NEXT: [[START_ADD_2_13:%.*]] = add nuw i8 [[START_ADD_2]], 13 -; CHECK-NEXT: [[F_1:%.*]] = icmp ule i8 [[START_ADD_2_13]], [[START_SHL_4]] -; CHECK-NEXT: ret i1 [[F_1]] +; CHECK-NEXT: ret i1 true ; entry: %add = add nuw i8 %start, 3 diff --git a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll index 30051080178e1..0d90bc2fb3435 100644 --- a/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll +++ b/llvm/test/Transforms/ConstraintElimination/sub-nuw.ll @@ -316,10 +316,9 @@ define i1 @sub_nuw_neg_i16(i16 %a) { ; CHECK-LABEL: @sub_nuw_neg_i16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[NEG2:%.*]] = sub nuw i16 [[A:%.*]], -305 -; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i16 0, [[NEG2]] -; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_1:%.*]], label [[EXIT_2:%.*]] +; CHECK-NEXT: br i1 false, label [[EXIT_1:%.*]], label [[EXIT_2:%.*]] ; CHECK: exit.1: -; CHECK-NEXT: ret i1 false +; CHECK-NEXT: ret i1 true ; CHECK: exit.2: ; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i16 [[A]], 0 ; CHECK-NEXT: ret i1 [[C_3]] diff --git a/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll b/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll index 5128614de1d1e..89095048f2249 100644 --- a/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll +++ b/llvm/test/Transforms/PhaseOrdering/runtime-check-removal.ll @@ -61,23 +61,6 @@ exit: define void @chained_conditions(i64 noundef %a, i64 noundef %b, i64 noundef %c, i64 noundef %d) #0 { ; CHECK-LABEL: @chained_conditions( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[A:%.*]], 2048 -; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[B:%.*]], 1024 -; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[CMP]], [[CMP1]] -; CHECK-NEXT: [[CMP3:%.*]] = icmp ugt i64 [[C:%.*]], 1024 -; CHECK-NEXT: [[OR_COND1:%.*]] = or i1 [[OR_COND]], [[CMP3]] -; CHECK-NEXT: br i1 [[OR_COND1]], label [[IF_END10:%.*]], label [[IF_END:%.*]] -; CHECK: if.end: -; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[B]], [[A]] -; CHECK-NEXT: [[ADD4:%.*]] = add nuw nsw i64 [[ADD]], [[C]] -; CHECK-NEXT: [[CMP5_NOT:%.*]] = icmp uge i64 [[ADD4]], [[D:%.*]] -; CHECK-NEXT: [[CMP8_NOT:%.*]] = icmp ult i64 [[A]], [[D]] -; CHECK-NEXT: [[OR_COND7:%.*]] = or i1 [[CMP5_NOT]], [[CMP8_NOT]] -; CHECK-NEXT: br i1 [[OR_COND7]], label [[IF_END10]], label [[IF_THEN9:%.*]] -; CHECK: if.then9: -; CHECK-NEXT: tail call void @bar() -; CHECK-NEXT: br label [[IF_END10]] -; CHECK: if.end10: ; CHECK-NEXT: ret void ; entry: From acacec3bbf4586ef9bc6c4f31707d3515d5215a1 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan <fpiovezan@apple.com> Date: Sat, 23 Dec 2023 13:44:45 -0300 Subject: [PATCH 215/342] [LiveDebugValues][nfc] Reduce memory usage of InstrRef (#76051) Commit 1b531d54f623 (#74203) removed the usage of unique_ptrs of arrays in favour of using vectors, but inadvertently increased peak memory usage by removing the ability to deallocate vector memory that was no longer needed mid-LDV. In that same review, it was pointed out that `FuncValueTable` typedef could be removed, since it was "just a vector". 
This commit addresses both issues by making `FuncValueTable` a real data structure, capable of mapping BBs to ValueTables and able to free ValueTables as needed. This reduces peak memory usage in the compiler by 10% in the benchmarks flagged by the original review. As a consequence, we had to remove a handful of instances of the "declare-then-initialize" antipattern in unittests, as the FuncValueTable class is no longer default-constructible. --- .../LiveDebugValues/InstrRefBasedImpl.cpp | 51 +++++++++---------- .../LiveDebugValues/InstrRefBasedImpl.h | 45 ++++++++++++++-- llvm/unittests/CodeGen/InstrRefLDVTest.cpp | 45 ++++++---------- 3 files changed, 80 insertions(+), 61 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index aeb8a20e1f122..9037f752dc4f3 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -2413,7 +2413,7 @@ bool InstrRefBasedLDV::mlocJoin( // Pick out the first predecessors live-out value for this location. It's // guaranteed to not be a backedge, as we order by RPO. - ValueIDNum FirstVal = OutLocs[BlockOrders[0]->getNumber()][Idx.asU64()]; + ValueIDNum FirstVal = OutLocs[*BlockOrders[0]][Idx.asU64()]; // If we've already eliminated a PHI here, do no further checking, just // propagate the first live-in value into this block. @@ -2430,8 +2430,7 @@ bool InstrRefBasedLDV::mlocJoin( bool Disagree = false; for (unsigned int I = 1; I < BlockOrders.size(); ++I) { const MachineBasicBlock *PredMBB = BlockOrders[I]; - const ValueIDNum &PredLiveOut = - OutLocs[PredMBB->getNumber()][Idx.asU64()]; + const ValueIDNum &PredLiveOut = OutLocs[*PredMBB][Idx.asU64()]; // Incoming values agree, continue trying to eliminate this PHI. if (FirstVal == PredLiveOut) @@ -2556,7 +2555,7 @@ void InstrRefBasedLDV::placeMLocPHIs( auto InstallPHIsAtLoc = [&PHIBlocks, &MInLocs](LocIdx L) { for (const MachineBasicBlock *MBB : PHIBlocks) - MInLocs[MBB->getNumber()][L.asU64()] = ValueIDNum(MBB->getNumber(), 0, L); + MInLocs[*MBB][L.asU64()] = ValueIDNum(MBB->getNumber(), 0, L); }; // For locations with no reg units, just place PHIs. @@ -2635,7 +2634,8 @@ void InstrRefBasedLDV::buildMLocValueMap( // Initialize entry block to PHIs. These represent arguments. for (auto Location : MTracker->locations()) - MInLocs[0][Location.Idx.asU64()] = ValueIDNum(0, 0, Location.Idx); + MInLocs.tableForEntryMBB()[Location.Idx.asU64()] = + ValueIDNum(0, 0, Location.Idx); MTracker->reset(); @@ -2664,7 +2664,7 @@ void InstrRefBasedLDV::buildMLocValueMap( // Join the values in all predecessor blocks. bool InLocsChanged; - InLocsChanged = mlocJoin(*MBB, Visited, MOutLocs, MInLocs[CurBB]); + InLocsChanged = mlocJoin(*MBB, Visited, MOutLocs, MInLocs[*MBB]); InLocsChanged |= Visited.insert(MBB).second; // Don't examine transfer function if we've visited this loc at least @@ -2673,7 +2673,7 @@ void InstrRefBasedLDV::buildMLocValueMap( continue; // Load the current set of live-ins into MLocTracker. - MTracker->loadFromArray(MInLocs[CurBB], CurBB); + MTracker->loadFromArray(MInLocs[*MBB], CurBB); // Each element of the transfer function can be a new def, or a read of // a live-in value. Evaluate each element, and store to "ToRemap". @@ -2700,8 +2700,8 @@ void InstrRefBasedLDV::buildMLocValueMap( // the transfer function, and mlocJoin. 
bool OLChanged = false; for (auto Location : MTracker->locations()) { - OLChanged |= MOutLocs[CurBB][Location.Idx.asU64()] != Location.Value; - MOutLocs[CurBB][Location.Idx.asU64()] = Location.Value; + OLChanged |= MOutLocs[*MBB][Location.Idx.asU64()] != Location.Value; + MOutLocs[*MBB][Location.Idx.asU64()] = Location.Value; } MTracker->reset(); @@ -2844,7 +2844,6 @@ std::optional<ValueIDNum> InstrRefBasedLDV::pickOperandPHILoc( unsigned NumLocs = MTracker->getNumLocs(); for (const auto p : BlockOrders) { - unsigned ThisBBNum = p->getNumber(); auto OutValIt = LiveOuts.find(p); assert(OutValIt != LiveOuts.end()); const DbgValue &OutVal = *OutValIt->second; @@ -2863,7 +2862,7 @@ std::optional<ValueIDNum> InstrRefBasedLDV::pickOperandPHILoc( ValueIDNum ValToLookFor = OutValOp.ID; // Search the live-outs of the predecessor for the specified value. for (unsigned int I = 0; I < NumLocs; ++I) { - if (MOutLocs[ThisBBNum][I] == ValToLookFor) + if (MOutLocs[*p][I] == ValToLookFor) Locs.back().push_back(LocIdx(I)); } } else { @@ -2876,7 +2875,7 @@ std::optional<ValueIDNum> InstrRefBasedLDV::pickOperandPHILoc( // machine-value PHI locations. for (unsigned int I = 0; I < NumLocs; ++I) { ValueIDNum MPHI(MBB.getNumber(), 0, LocIdx(I)); - if (MOutLocs[ThisBBNum][I] == MPHI) + if (MOutLocs[*p][I] == MPHI) Locs.back().push_back(LocIdx(I)); } } @@ -3498,19 +3497,15 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit( // Helper lambda for ejecting a block -- if nothing is going to use the block, // we can translate the variable location information into DBG_VALUEs and then // free all of InstrRefBasedLDV's data structures. - SmallPtrSet<const MachineBasicBlock *, 8> EjectedBBs; auto EjectBlock = [&](MachineBasicBlock &MBB) -> void { - if (EjectedBBs.insert(&MBB).second == false) - return; unsigned BBNum = MBB.getNumber(); AllTheVLocs[BBNum].clear(); // Prime the transfer-tracker, and then step through all the block // instructions, installing transfers. MTracker->reset(); - MTracker->loadFromArray(MInLocs[BBNum], BBNum); - TTracker->loadInlocs(MBB, MInLocs[BBNum], DbgOpStore, Output[BBNum], - NumLocs); + MTracker->loadFromArray(MInLocs[MBB], BBNum); + TTracker->loadInlocs(MBB, MInLocs[MBB], DbgOpStore, Output[BBNum], NumLocs); CurBB = BBNum; CurInst = 1; @@ -3521,8 +3516,8 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit( } // Free machine-location tables for this block. - MInLocs[BBNum] = ValueTable(); - MOutLocs[BBNum] = ValueTable(); + MInLocs.ejectTableForBlock(MBB); + MOutLocs.ejectTableForBlock(MBB); // We don't need live-in variable values for this block either. Output[BBNum].clear(); AllTheVLocs[BBNum].clear(); @@ -3587,7 +3582,8 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit( // anything for such out-of-scope blocks, but for the sake of being similar // to VarLocBasedLDV, eject these too. for (auto *MBB : ArtificialBlocks) - EjectBlock(*MBB); + if (MInLocs.hasTableFor(*MBB)) + EjectBlock(*MBB); return emitTransfers(AllVarsNumbering); } @@ -3686,8 +3682,8 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, // machine values. The outer dimension is the block number; while the inner // dimension is a LocIdx from MLocTracker. 
unsigned NumLocs = MTracker->getNumLocs(); - FuncValueTable MOutLocs(MaxNumBlocks, ValueTable(NumLocs)); - FuncValueTable MInLocs(MaxNumBlocks, ValueTable(NumLocs)); + FuncValueTable MOutLocs(MaxNumBlocks, NumLocs); + FuncValueTable MInLocs(MaxNumBlocks, NumLocs); // Solve the machine value dataflow problem using the MLocTransfer function, // storing the computed live-ins / live-outs into the array-of-arrays. We use @@ -3725,7 +3721,7 @@ bool InstrRefBasedLDV::ExtendRanges(MachineFunction &MF, CurBB = MBB.getNumber(); VTracker = &vlocs[CurBB]; VTracker->MBB = &MBB; - MTracker->loadFromArray(MInLocs[CurBB], CurBB); + MTracker->loadFromArray(MInLocs[MBB], CurBB); CurInst = 1; for (auto &MI : MBB) { process(MI, &MOutLocs, &MInLocs); @@ -3939,7 +3935,7 @@ class LDVSSAUpdater { /// Find the live-in value number for the given block. Looks up the value at /// the PHI location on entry. BlockValueNum getValue(LDVSSABlock *LDVBB) { - return MLiveIns[LDVBB->BB.getNumber()][Loc.asU64()].asU64(); + return MLiveIns[LDVBB->BB][Loc.asU64()].asU64(); } }; @@ -4179,8 +4175,7 @@ std::optional<ValueIDNum> InstrRefBasedLDV::resolveDbgPHIsImpl( }); for (auto &PHI : SortedPHIs) { - ValueIDNum ThisBlockValueNum = - MLiveIns[PHI->ParentBlock->BB.getNumber()][Loc.asU64()]; + ValueIDNum ThisBlockValueNum = MLiveIns[PHI->ParentBlock->BB][Loc.asU64()]; // Are all these things actually defined? for (auto &PHIIt : PHI->IncomingValues) { @@ -4189,7 +4184,7 @@ std::optional<ValueIDNum> InstrRefBasedLDV::resolveDbgPHIsImpl( return std::nullopt; ValueIDNum ValueToCheck; - const ValueTable &BlockLiveOuts = MLiveOuts[PHIIt.first->BB.getNumber()]; + const ValueTable &BlockLiveOuts = MLiveOuts[PHIIt.first->BB]; auto VVal = ValidatedValues.find(PHIIt.first); if (VVal == ValidatedValues.end()) { diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h index d6dbb1feda3e8..ccc284b623310 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h @@ -207,9 +207,48 @@ using namespace llvm; /// Type for a table of values in a block. using ValueTable = SmallVector<ValueIDNum, 0>; -/// Type for a table-of-table-of-values, i.e., the collection of either -/// live-in or live-out values for each block in the function. -using FuncValueTable = SmallVector<ValueTable, 0>; +/// A collection of ValueTables, one per BB in a function, with convenient +/// accessor methods. +struct FuncValueTable { + FuncValueTable(int NumBBs, int NumLocs) { + Storage.reserve(NumBBs); + for (int i = 0; i != NumBBs; ++i) + Storage.push_back( + std::make_unique<ValueTable>(NumLocs, ValueIDNum::EmptyValue)); + } + + /// Returns the ValueTable associated with MBB. + ValueTable &operator[](const MachineBasicBlock &MBB) const { + return (*this)[MBB.getNumber()]; + } + + /// Returns the ValueTable associated with the MachineBasicBlock whose number + /// is MBBNum. + ValueTable &operator[](int MBBNum) const { + auto &TablePtr = Storage[MBBNum]; + assert(TablePtr && "Trying to access a deleted table"); + return *TablePtr; + } + + /// Returns the ValueTable associated with the entry MachineBasicBlock. + ValueTable &tableForEntryMBB() const { return (*this)[0]; } + + /// Returns true if the ValueTable associated with MBB has not been freed. + bool hasTableFor(MachineBasicBlock &MBB) const { + return Storage[MBB.getNumber()] != nullptr; + } + + /// Frees the memory of the ValueTable associated with MBB. 
+ void ejectTableForBlock(const MachineBasicBlock &MBB) { + Storage[MBB.getNumber()].reset(); + } + +private: + /// ValueTables are stored as unique_ptrs to allow for deallocation during + /// LDV; this was measured to have a significant impact on compiler memory + /// usage. + SmallVector<std::unique_ptr<ValueTable>, 0> Storage; +}; /// Thin wrapper around an integer -- designed to give more type safety to /// spill location numbers. diff --git a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp index acbcd247fa9a0..7d4eaf388f727 100644 --- a/llvm/unittests/CodeGen/InstrRefLDVTest.cpp +++ b/llvm/unittests/CodeGen/InstrRefLDVTest.cpp @@ -497,8 +497,7 @@ body: | std::pair<FuncValueTable, FuncValueTable> allocValueTables(unsigned Blocks, unsigned Locs) { - return {FuncValueTable(Blocks, ValueTable(Locs)), - FuncValueTable(Blocks, ValueTable(Locs))}; + return {FuncValueTable(Blocks, Locs), FuncValueTable(Blocks, Locs)}; } }; @@ -917,8 +916,7 @@ TEST_F(InstrRefLDVTest, MLocSingleBlock) { // Set up live-in and live-out tables for this function: two locations (we // add one later) in a single block. - FuncValueTable MOutLocs, MInLocs; - std::tie(MOutLocs, MInLocs) = allocValueTables(1, 2); + auto [MOutLocs, MInLocs] = allocValueTables(1, 2); // Transfer function: nothing. SmallVector<MLocTransferMap, 1> TransferFunc; @@ -983,8 +981,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondBlocks) { Register RAX = getRegByName("RAX"); LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(4, 2); + auto [MInLocs, MOutLocs] = allocValueTables(4, 2); // Transfer function: start with nothing. SmallVector<MLocTransferMap, 1> TransferFunc; @@ -1137,8 +1134,7 @@ TEST_F(InstrRefLDVTest, MLocDiamondSpills) { // There are other locations, for things like xmm0, which we're going to // ignore here. - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(4, 11); + auto [MInLocs, MOutLocs] = allocValueTables(4, 11); // Transfer function: start with nothing. 
SmallVector<MLocTransferMap, 1> TransferFunc; @@ -1199,8 +1195,7 @@ TEST_F(InstrRefLDVTest, MLocSimpleLoop) { Register RAX = getRegByName("RAX"); LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(3, 2); + auto [MInLocs, MOutLocs] = allocValueTables(3, 2); SmallVector<MLocTransferMap, 1> TransferFunc; TransferFunc.resize(3); @@ -1298,8 +1293,7 @@ TEST_F(InstrRefLDVTest, MLocNestedLoop) { Register RAX = getRegByName("RAX"); LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(5, 2); + auto [MInLocs, MOutLocs] = allocValueTables(5, 2); SmallVector<MLocTransferMap, 1> TransferFunc; TransferFunc.resize(5); @@ -1500,8 +1494,7 @@ TEST_F(InstrRefLDVTest, MLocNoDominatingLoop) { Register RAX = getRegByName("RAX"); LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(5, 2); + auto [MInLocs, MOutLocs] = allocValueTables(5, 2); SmallVector<MLocTransferMap, 1> TransferFunc; TransferFunc.resize(5); @@ -1656,8 +1649,7 @@ TEST_F(InstrRefLDVTest, MLocBadlyNestedLoops) { Register RAX = getRegByName("RAX"); LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(5, 2); + auto [MInLocs, MOutLocs] = allocValueTables(5, 2); SmallVector<MLocTransferMap, 1> TransferFunc; TransferFunc.resize(5); @@ -1789,8 +1781,7 @@ TEST_F(InstrRefLDVTest, pickVPHILocDiamond) { Register RAX = getRegByName("RAX"); LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(4, 2); + auto [MInLocs, MOutLocs] = allocValueTables(4, 2); initValueArray(MOutLocs, 4, 2); @@ -1986,8 +1977,7 @@ TEST_F(InstrRefLDVTest, pickVPHILocLoops) { Register RAX = getRegByName("RAX"); LocIdx RaxLoc = MTracker->lookupOrTrackRegister(RAX); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(3, 2); + auto [MInLocs, MOutLocs] = allocValueTables(3, 2); initValueArray(MOutLocs, 3, 2); @@ -2117,8 +2107,7 @@ TEST_F(InstrRefLDVTest, pickVPHILocBadlyNestedLoops) { Register RBX = getRegByName("RBX"); LocIdx RbxLoc = MTracker->lookupOrTrackRegister(RBX); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(5, 3); + auto [MInLocs, MOutLocs] = allocValueTables(5, 3); initValueArray(MOutLocs, 5, 3); @@ -2635,8 +2624,7 @@ TEST_F(InstrRefLDVTest, VLocSingleBlock) { ASSERT_TRUE(MTracker->getNumLocs() == 1); LocIdx RspLoc(0); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(1, 2); + auto [MInLocs, MOutLocs] = allocValueTables(1, 2); ValueIDNum LiveInRsp = ValueIDNum(0, 0, RspLoc); DbgOpID LiveInRspID = addValueDbgOp(LiveInRsp); @@ -2699,8 +2687,7 @@ TEST_F(InstrRefLDVTest, VLocDiamondBlocks) { DbgOpID LiveInRaxID = addValueDbgOp(LiveInRax); DbgOpID RspPHIInBlk3ID = addValueDbgOp(RspPHIInBlk3); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(4, 2); + auto [MInLocs, MOutLocs] = allocValueTables(4, 2); initValueArray(MInLocs, 4, 2); initValueArray(MOutLocs, 4, 2); @@ -2921,8 +2908,7 @@ TEST_F(InstrRefLDVTest, VLocSimpleLoop) { DbgOpID RspDefInBlk1ID = addValueDbgOp(RspDefInBlk1); DbgOpID RaxPHIInBlk1ID = addValueDbgOp(RaxPHIInBlk1); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(3, 2); 
+ auto [MInLocs, MOutLocs] = allocValueTables(3, 2); initValueArray(MInLocs, 3, 2); initValueArray(MOutLocs, 3, 2); @@ -3200,8 +3186,7 @@ TEST_F(InstrRefLDVTest, VLocNestedLoop) { DbgOpID RspPHIInBlk2ID = addValueDbgOp(RspPHIInBlk2); DbgOpID RspDefInBlk2ID = addValueDbgOp(RspDefInBlk2); - FuncValueTable MInLocs, MOutLocs; - std::tie(MInLocs, MOutLocs) = allocValueTables(5, 2); + auto [MInLocs, MOutLocs] = allocValueTables(5, 2); initValueArray(MInLocs, 5, 2); initValueArray(MOutLocs, 5, 2); From 061e4f24b24a3b59d73a94dc6f2f0d21a2b7beac Mon Sep 17 00:00:00 2001 From: Rik Huijzer <github@huijzer.xyz> Date: Sat, 23 Dec 2023 21:48:33 +0100 Subject: [PATCH 216/342] [mlir][doc] Escape effects, interfaces, and traits (#76297) Fixes https://github.com/llvm/llvm-project/issues/76270. Thanks to @scottamain for the clear description. Co-authored-by: Scott Main <scott@modular.com> --- mlir/test/mlir-tblgen/gen-dialect-doc.td | 6 +++--- mlir/tools/mlir-tblgen/OpDocGen.cpp | 12 +++++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/mlir/test/mlir-tblgen/gen-dialect-doc.td b/mlir/test/mlir-tblgen/gen-dialect-doc.td index ca0b6e38edf82..c9492eb9ac3ce 100644 --- a/mlir/test/mlir-tblgen/gen-dialect-doc.td +++ b/mlir/test/mlir-tblgen/gen-dialect-doc.td @@ -81,9 +81,9 @@ def TestTypeDefParams : TypeDef<Test_Dialect, "TestTypeDefParams"> { // CHECK: Other group // CHECK: test.b // CHECK: test.c -// CHECK: Traits: SingleBlock, SingleBlockImplicitTerminator<YieldOp> -// CHECK: Interfaces: NoMemoryEffect (MemoryEffectOpInterface) -// CHECK: Effects: MemoryEffects::Effect{} +// CHECK: Traits: `SingleBlockImplicitTerminator<YieldOp>`, `SingleBlock` +// CHECK: Interfaces: `NoMemoryEffect (MemoryEffectOpInterface)` +// CHECK: Effects: `MemoryEffects::Effect{}` // CHECK: ## Attribute constraints // CHECK: ### attribute summary diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp index 877ef1089dcec..7cd2690ea8155 100644 --- a/mlir/tools/mlir-tblgen/OpDocGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp @@ -123,6 +123,12 @@ static void emitAssemblyFormat(StringRef opName, StringRef format, os << "```\n\n"; } +/// Place `text` between backticks so that the Markdown processor renders it as +/// inline code. +static std::string backticks(const std::string &text) { + return '`' + text + '`'; +} + static void emitOpTraitsDoc(const Operator &op, raw_ostream &os) { // TODO: We should link to the trait/documentation of it. That also means we // should add descriptions to traits that can be queried. 
@@ -155,14 +161,14 @@ static void emitOpTraitsDoc(const Operator &op, raw_ostream &os) { os << effect << " on " << rec->getValueAsString("resource"); }); os << "}"; - effects.insert(os.str()); + effects.insert(backticks(os.str())); name.append(llvm::formatv(" ({0})", traitName).str()); } - interfaces.insert(name); + interfaces.insert(backticks(name)); continue; } - traits.insert(name); + traits.insert(backticks(name)); } if (!traits.empty()) { llvm::interleaveComma(traits, os << "\nTraits: "); From 0e07bf91f7e3d5a53f0a51309da12e91ea8accc9 Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Sat, 23 Dec 2023 16:23:55 -0800 Subject: [PATCH 217/342] [NFC][sanitizer] Rename to Lock{Before,After}Fork StackDepotBase locking (#76303) Followup to #76279 --- compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp | 4 ++-- compiler-rt/lib/msan/msan_chained_origin_depot.cpp | 4 ++-- .../sanitizer_common/sanitizer_chained_origin_depot.cpp | 6 ++++-- .../sanitizer_common/sanitizer_chained_origin_depot.h | 4 ++-- .../lib/sanitizer_common/sanitizer_stackdepot.cpp | 4 ++-- .../lib/sanitizer_common/sanitizer_stackdepotbase.h | 9 +++++---- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp b/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp index 6644bd6a7c6c0..f95194d19f03a 100644 --- a/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp +++ b/compiler-rt/lib/dfsan/dfsan_chained_origin_depot.cpp @@ -19,10 +19,10 @@ static ChainedOriginDepot chainedOriginDepot; ChainedOriginDepot* GetChainedOriginDepot() { return &chainedOriginDepot; } -void ChainedOriginDepotLockBeforeFork() { chainedOriginDepot.LockAll(); } +void ChainedOriginDepotLockBeforeFork() { chainedOriginDepot.LockBeforeFork(); } void ChainedOriginDepotUnlockAfterFork(bool fork_child) { - chainedOriginDepot.UnlockAll(); + chainedOriginDepot.UnlockAfterFork(fork_child); } } // namespace __dfsan diff --git a/compiler-rt/lib/msan/msan_chained_origin_depot.cpp b/compiler-rt/lib/msan/msan_chained_origin_depot.cpp index c3bd54141e6c3..b98b0e6b14b58 100644 --- a/compiler-rt/lib/msan/msan_chained_origin_depot.cpp +++ b/compiler-rt/lib/msan/msan_chained_origin_depot.cpp @@ -31,10 +31,10 @@ u32 ChainedOriginDepotGet(u32 id, u32 *other) { return chainedOriginDepot.Get(id, other); } -void ChainedOriginDepotBeforeFork() { chainedOriginDepot.LockAll(); } +void ChainedOriginDepotBeforeFork() { chainedOriginDepot.LockBeforeFork(); } void ChainedOriginDepotAfterFork(bool fork_child) { - chainedOriginDepot.UnlockAll(); + chainedOriginDepot.UnlockAfterFork(fork_child); } } // namespace __msan diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.cpp index e0e2bd01069f2..df2b2eb23df28 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.cpp @@ -139,9 +139,11 @@ u32 ChainedOriginDepot::Get(u32 id, u32 *other) { return desc.here_id; } -void ChainedOriginDepot::LockAll() { depot.LockAll(); } +void ChainedOriginDepot::LockBeforeFork() { depot.LockBeforeFork(); } -void ChainedOriginDepot::UnlockAll() { depot.UnlockAll(); } +void ChainedOriginDepot::UnlockAfterFork(bool fork_child) { + depot.UnlockAfterFork(fork_child); +} void ChainedOriginDepot::TestOnlyUnmap() { depot.TestOnlyUnmap(); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.h 
b/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.h index f9f192b685719..f3da28129e6bd 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_chained_origin_depot.h @@ -32,8 +32,8 @@ class ChainedOriginDepot { // Retrieves the stored StackDepot ID for the given origin ID. u32 Get(u32 id, u32 *other); - void LockAll(); - void UnlockAll(); + void LockBeforeFork(); + void UnlockAfterFork(bool fork_child); void TestOnlyUnmap(); private: diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp index ce21f3c178bce..3776e8c97057e 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepot.cpp @@ -216,7 +216,7 @@ StackTrace StackDepotGet(u32 id) { } void StackDepotLockBeforeFork() { - theDepot.LockAll(); + theDepot.LockBeforeFork(); compress_thread.LockAndStop(); stackStore.LockAll(); } @@ -224,7 +224,7 @@ void StackDepotLockBeforeFork() { void StackDepotUnlockAfterFork(bool fork_child) { stackStore.UnlockAll(); compress_thread.Unlock(); - theDepot.UnlockAll(); + theDepot.UnlockAfterFork(fork_child); } void StackDepotPrintAll() { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h index 96d1ddc87fd03..21d57d9ab2a91 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h @@ -52,8 +52,8 @@ class StackDepotBase { }; } - void LockAll(); - void UnlockAll(); + void LockBeforeFork(); + void UnlockAfterFork(bool fork_child); void PrintAll(); void TestOnlyUnmap() { @@ -160,14 +160,15 @@ StackDepotBase<Node, kReservedBits, kTabSizeLog>::Get(u32 id) { } template <class Node, int kReservedBits, int kTabSizeLog> -void StackDepotBase<Node, kReservedBits, kTabSizeLog>::LockAll() { +void StackDepotBase<Node, kReservedBits, kTabSizeLog>::LockBeforeFork() { for (int i = 0; i < kTabSize; ++i) { lock(&tab[i]); } } template <class Node, int kReservedBits, int kTabSizeLog> -void StackDepotBase<Node, kReservedBits, kTabSizeLog>::UnlockAll() { +void StackDepotBase<Node, kReservedBits, kTabSizeLog>::UnlockAfterFork( + bool fork_child) { for (int i = 0; i < kTabSize; ++i) { atomic_uint32_t *p = &tab[i]; uptr s = atomic_load(p, memory_order_relaxed); From 6e20df1a3b0f7654b2821fe182c7ae9bd52672e6 Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Sat, 23 Dec 2023 21:41:53 +0800 Subject: [PATCH 218/342] [X86][NFC] Set default OpPrefix to PS for XOP/VEX/EVEX instructions It helps simplify the class definitions. Now, the only explicit usage of PS is to check prefix 0x66/0xf2/0xf3 can not be used a prefix, e.g. wbinvd. See 82974e0114f02ffc07557e217d87f8dc4e100a26 for more details. 
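
For illustration only, and not part of the change: a minimal, hypothetical .td sketch of the defaulting trick this patch applies in X86InstrFormats.td. Every class and def name below is invented for the example; only the !if/!eq defaulting pattern mirrors the real code, whose fields, maps, and encodings are richer.

  // Standalone sketch; llvm-tblgen can dump these records as written.
  class Encoding<bits<2> val> { bits<2> Value = val; }
  def EncNormal : Encoding<0>;
  def EncVEX    : Encoding<1>;

  class Prefix<bits<3> val> { bits<3> Value = val; }
  def NoPrfx : Prefix<0>;
  def PS     : Prefix<1>;

  class Inst<Encoding enc> {
    Encoding OpEnc = enc;
    // PS is implied unless the instruction uses the normal (legacy) encoding.
    Prefix OpPrefix = !if(!eq(OpEnc, EncNormal), NoPrfx, PS);
  }

  // A VEX/EVEX-style instruction no longer needs to name PS explicitly.
  def MyVexInst : Inst<EncVEX>;

With the default tied to the encoding, the bulk of the diff below is mechanical: XOP/VEX/EVEX instruction definitions simply drop the explicit PS from their class lists.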
--- llvm/lib/Target/X86/X86InstrAMX.td | 8 +- llvm/lib/Target/X86/X86InstrAVX512.td | 128 ++++++++++---------- llvm/lib/Target/X86/X86InstrArithmetic.td | 4 +- llvm/lib/Target/X86/X86InstrFPStack.td | 8 +- llvm/lib/Target/X86/X86InstrFormats.td | 5 +- llvm/lib/Target/X86/X86InstrMMX.td | 6 +- llvm/lib/Target/X86/X86InstrMisc.td | 82 ++++++------- llvm/lib/Target/X86/X86InstrRAOINT.td | 2 +- llvm/lib/Target/X86/X86InstrSGX.td | 6 +- llvm/lib/Target/X86/X86InstrSSE.td | 138 +++++++++++----------- llvm/lib/Target/X86/X86InstrSystem.td | 54 ++++----- llvm/lib/Target/X86/X86InstrTSX.td | 4 +- llvm/lib/Target/X86/X86InstrUtils.td | 26 ++-- llvm/lib/Target/X86/X86InstrVMX.td | 22 ++-- 14 files changed, 247 insertions(+), 246 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index a4292b99511bb..7f3e193d9a1b9 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -20,7 +20,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in { Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), "ldtilecfg\t$src", - [(int_x86_ldtilecfg addr:$src)]>, VEX, T8, PS; + [(int_x86_ldtilecfg addr:$src)]>, VEX, T8; let hasSideEffects = 1 in def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), "sttilecfg\t$src", @@ -37,7 +37,7 @@ let Predicates = [HasAMXTILE, In64BitMode] in { VEX, T8, PD; let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def TILERELEASE : I<0x49, MRM_C0, (outs), (ins), - "tilerelease", [(int_x86_tilerelease)]>, VEX, T8, PS; + "tilerelease", [(int_x86_tilerelease)]>, VEX, T8; let mayStore = 1 in def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs), (ins sibmem:$dst, TILE:$src), @@ -103,7 +103,7 @@ let Predicates = [HasAMXINT8, In64BitMode] in { def TDPBUUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, - VEX, VVVV, T8, PS; + VEX, VVVV, T8; } // Pseduo instruction for RA. 
@@ -226,7 +226,7 @@ let Predicates = [HasAMXCOMPLEX, In64BitMode] in { def TCMMRLFP16PS : I<0x6c, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tcmmrlfp16ps\t{$src3, $src2, $src1|$src1, $src2, $src3}", - []>, VEX, VVVV, WIG, T8, PS; + []>, VEX, VVVV, WIG, T8; } // Constraints = "$src1 = $dst" diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index e3a4aee3aceb7..7c3c1d5fe42b3 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2634,11 +2634,11 @@ let Predicates = [HasDQI, HasEGPR, In64BitMode] in let Predicates = [HasAVX512, NoEGPR] in defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>, avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>, - VEX, TB, PS; + VEX, TB; let Predicates = [HasAVX512, HasEGPR, In64BitMode] in defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem, "_EVEX">, avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32, "_EVEX">, - EVEX, TB, PS; + EVEX, TB; let Predicates = [HasBWI, NoEGPR] in { defm KMOVD : avx512_mask_mov<0x90, 0x90, 0x91, "kmovd", VK32, v32i1,i32mem>, @@ -2646,7 +2646,7 @@ let Predicates = [HasBWI, NoEGPR] in { defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32>, VEX, TB, XD; defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem>, - VEX, TB, PS, REX_W; + VEX, TB, REX_W; defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64>, VEX, TB, XD, REX_W; } @@ -2656,7 +2656,7 @@ let Predicates = [HasBWI, HasEGPR, In64BitMode] in { defm KMOVD : avx512_mask_mov_gpr<0x92, 0x93, "kmovd", VK32, GR32, "_EVEX">, EVEX, TB, XD; defm KMOVQ : avx512_mask_mov<0x90, 0x90, 0x91, "kmovq", VK64, v64i1, i64mem, "_EVEX">, - EVEX, TB, PS, REX_W; + EVEX, TB, REX_W; defm KMOVQ : avx512_mask_mov_gpr<0x92, 0x93, "kmovq", VK64, GR64, "_EVEX">, EVEX, TB, XD, REX_W; } @@ -2771,11 +2771,11 @@ multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr, defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, sched, HasDQI>, VEX, TB, PD; defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - sched, HasAVX512>, VEX, TB, PS; + sched, HasAVX512>, VEX, TB; defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, sched, HasBWI>, VEX, TB, PD, REX_W; defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, - sched, HasBWI>, VEX, TB, PS, REX_W; + sched, HasBWI>, VEX, TB, REX_W; } // TODO - do we need a X86SchedWriteWidths::KMASK type? @@ -2814,11 +2814,11 @@ multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, sched, HasDQI, IsCommutable>, VEX, VVVV, VEX_L, TB, PD; defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - sched, prdW, IsCommutable>, VEX, VVVV, VEX_L, TB, PS; + sched, prdW, IsCommutable>, VEX, VVVV, VEX_L, TB; defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, TB, PD; defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, - sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, TB, PS; + sched, HasBWI, IsCommutable>, VEX, VVVV, VEX_L, REX_W, TB; } // TODO - do we need a X86SchedWriteWidths::KMASK type? 
@@ -2877,8 +2877,8 @@ multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst, } defm KUNPCKBW : avx512_mask_unpck<"bw", v16i1_info, v8i1_info, WriteShuffle, HasAVX512>, TB, PD; -defm KUNPCKWD : avx512_mask_unpck<"wd", v32i1_info, v16i1_info, WriteShuffle, HasBWI>, TB, PS; -defm KUNPCKDQ : avx512_mask_unpck<"dq", v64i1_info, v32i1_info, WriteShuffle, HasBWI>, TB, PS, REX_W; +defm KUNPCKWD : avx512_mask_unpck<"wd", v32i1_info, v16i1_info, WriteShuffle, HasBWI>, TB; +defm KUNPCKDQ : avx512_mask_unpck<"dq", v64i1_info, v32i1_info, WriteShuffle, HasBWI>, TB, REX_W; // Mask bit testing multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, @@ -2897,9 +2897,9 @@ multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode, defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, sched, HasDQI>, VEX, TB, PD; defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, sched, prdW>, - VEX, TB, PS; + VEX, TB; defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, sched, HasBWI>, - VEX, TB, PS, REX_W; + VEX, TB, REX_W; defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, sched, HasBWI>, VEX, TB, PD, REX_W; } @@ -3371,7 +3371,7 @@ defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, HasAVX512, SchedWriteFMoveLS, "VMOVAPS">, - TB, PS, EVEX_CD8<32, CD8VF>; + TB, EVEX_CD8<32, CD8VF>; defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, HasAVX512, SchedWriteFMoveLS, "VMOVAPD">, @@ -3383,7 +3383,7 @@ defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, SchedWriteFMoveLS, "VMOVUPS", 0, null_frag>, avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512, SchedWriteFMoveLS, "VMOVUPS">, - TB, PS, EVEX_CD8<32, CD8VF>; + TB, EVEX_CD8<32, CD8VF>; defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, SchedWriteFMoveLS, "VMOVUPD", 0, null_frag>, @@ -4589,7 +4589,7 @@ defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", avx512vl_i64_info, defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", avx512vl_f64_info, SchedWriteFMoveLSNT>, TB, PD, REX_W; defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", avx512vl_f32_info, - SchedWriteFMoveLSNT>, TB, PS; + SchedWriteFMoveLSNT>, TB; let Predicates = [HasAVX512], AddedComplexity = 400 in { def : Pat<(alignednontemporalstore (v16i32 VR512:$src), addr:$dst), @@ -5607,7 +5607,7 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator Op bit IsPD128Commutable = IsCommutable> { let Predicates = [prd] in { defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f32_info, - sched.PS.ZMM, IsCommutable>, EVEX_V512, TB, PS, + sched.PS.ZMM, IsCommutable>, EVEX_V512, TB, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f64_info, sched.PD.ZMM, IsCommutable>, EVEX_V512, TB, PD, REX_W, @@ -5617,10 +5617,10 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator Op // Define only if AVX512VL feature is present. 
let Predicates = [prd, HasVLX] in { defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f32x_info, - sched.PS.XMM, IsCommutable>, EVEX_V128, TB, PS, + sched.PS.XMM, IsCommutable>, EVEX_V128, TB, EVEX_CD8<32, CD8VF>; defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f32x_info, - sched.PS.YMM, IsCommutable>, EVEX_V256, TB, PS, + sched.PS.YMM, IsCommutable>, EVEX_V256, TB, EVEX_CD8<32, CD8VF>; defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v2f64x_info, sched.PD.XMM, IsPD128Commutable, @@ -5637,15 +5637,15 @@ multiclass avx512_fp_binop_ph<bits<8> opc, string OpcodeStr, SDPatternOperator O X86SchedWriteSizes sched, bit IsCommutable = 0> { let Predicates = [HasFP16] in { defm PHZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v32f16_info, - sched.PH.ZMM, IsCommutable>, EVEX_V512, T_MAP5, PS, + sched.PH.ZMM, IsCommutable>, EVEX_V512, T_MAP5, EVEX_CD8<16, CD8VF>; } let Predicates = [HasVLX, HasFP16] in { defm PHZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f16x_info, - sched.PH.XMM, IsCommutable>, EVEX_V128, T_MAP5, PS, + sched.PH.XMM, IsCommutable>, EVEX_V128, T_MAP5, EVEX_CD8<16, CD8VF>; defm PHZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f16x_info, - sched.PH.YMM, IsCommutable>, EVEX_V256, T_MAP5, PS, + sched.PH.YMM, IsCommutable>, EVEX_V256, T_MAP5, EVEX_CD8<16, CD8VF>; } } @@ -5656,11 +5656,11 @@ multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeR let Predicates = [HasFP16] in { defm PHZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PH.ZMM, v32f16_info>, - EVEX_V512, T_MAP5, PS, EVEX_CD8<16, CD8VF>; + EVEX_V512, T_MAP5, EVEX_CD8<16, CD8VF>; } defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM, v16f32_info>, - EVEX_V512, TB, PS, EVEX_CD8<32, CD8VF>; + EVEX_V512, TB, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM, v8f64_info>, EVEX_V512, TB, PD, REX_W,EVEX_CD8<64, CD8VF>; @@ -5672,11 +5672,11 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd let Predicates = [HasFP16] in { defm PHZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PH.ZMM, v32f16_info>, - EVEX_V512, T_MAP5, PS, EVEX_CD8<16, CD8VF>; + EVEX_V512, T_MAP5, EVEX_CD8<16, CD8VF>; } defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM, v16f32_info>, - EVEX_V512, TB, PS, EVEX_CD8<32, CD8VF>; + EVEX_V512, TB, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PD.ZMM, v8f64_info>, EVEX_V512, TB, PD, REX_W,EVEX_CD8<64, CD8VF>; @@ -6500,11 +6500,11 @@ multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, // No patterns for MOVLPS/MOVHPS as the Movlhps node should only be created in // SSE1. And MOVLPS pattern is even more complex. 
defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", null_frag, - v4f32x_info>, EVEX_CD8<32, CD8VT2>, TB, PS; + v4f32x_info>, EVEX_CD8<32, CD8VT2>, TB; defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl, v2f64x_info>, EVEX_CD8<64, CD8VT1>, TB, PD, REX_W; defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", null_frag, - v4f32x_info>, EVEX_CD8<32, CD8VT2>, TB, PS; + v4f32x_info>, EVEX_CD8<32, CD8VT2>, TB; defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movsd, v2f64x_info>, EVEX_CD8<64, CD8VT1>, TB, PD, REX_W; @@ -7731,10 +7731,10 @@ defm VCVTSH2SD : avx512_cvt_fp_scalar_extend<0x5A, "vcvtsh2sd", X86fpexts, f64x_info, HasFP16>, T_MAP5, XS; defm VCVTSS2SH : avx512_cvt_fp_scalar_trunc<0x1D, "vcvtss2sh", X86frounds, X86froundsRnd, WriteCvtSD2SS, f32x_info, - f16x_info, HasFP16>, T_MAP5, PS; + f16x_info, HasFP16>, T_MAP5; defm VCVTSH2SS : avx512_cvt_fp_scalar_extend<0x13, "vcvtsh2ss", X86fpexts, X86fpextsSAE, WriteCvtSS2SD, f16x_info, - f32x_info, HasFP16>, T_MAP6, PS; + f32x_info, HasFP16>, T_MAP6; def : Pat<(f64 (any_fpextend FR32X:$src)), (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>, @@ -7999,7 +7999,7 @@ defm VCVTPD2PS : avx512_cvt_trunc<0x5A, "vcvtpd2ps", REX_W, TB, PD, EVEX_CD8<64, CD8VF>; defm VCVTPS2PD : avx512_cvt_extend<0x5A, "vcvtps2pd", avx512vl_f64_info, avx512vl_f32_info, SchedWriteCvtPS2PD>, - TB, PS, EVEX_CD8<32, CD8VH>; + TB, EVEX_CD8<32, CD8VH>; // Extend Half to Double multiclass avx512_cvtph2pd<bits<8> opc, string OpcodeStr, @@ -8115,7 +8115,7 @@ defm VCVTPH2PSX : avx512_cvt_extend<0x13, "vcvtph2psx", avx512vl_f32_info, defm VCVTPD2PH : avx512_cvtpd2ph<0x5A, "vcvtpd2ph", SchedWriteCvtPD2PS>, REX_W, T_MAP5, PD, EVEX_CD8<64, CD8VF>; defm VCVTPH2PD : avx512_cvtph2pd<0x5A, "vcvtph2pd", SchedWriteCvtPS2PD>, - T_MAP5, PS, EVEX_CD8<16, CD8VQ>; + T_MAP5, EVEX_CD8<16, CD8VQ>; let Predicates = [HasFP16, HasVLX] in { // Special patterns to allow use of X86vmfpround for masking. 
Instruction @@ -8600,7 +8600,7 @@ defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, sint_to_fp, defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp, sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>, - TB, PS, EVEX_CD8<32, CD8VF>; + TB, EVEX_CD8<32, CD8VF>; defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86any_cvttp2si, X86cvttp2si, X86cvttp2siSAE, @@ -8613,12 +8613,12 @@ defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86any_cvttp2si, defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86any_cvttp2ui, X86cvttp2ui, X86cvttp2uiSAE, - SchedWriteCvtPS2DQ>, TB, PS, EVEX_CD8<32, CD8VF>; + SchedWriteCvtPS2DQ>, TB, EVEX_CD8<32, CD8VF>; defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86any_cvttp2ui, X86cvttp2ui, X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, - TB, PS, REX_W, EVEX_CD8<64, CD8VF>; + TB, REX_W, EVEX_CD8<64, CD8VF>; defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", any_uint_to_fp, uint_to_fp, X86any_VUintToFP, X86VUintToFP, @@ -8638,11 +8638,11 @@ defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int, X86cvtp2Int, defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt, X86cvtp2UInt, X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, - TB, PS, EVEX_CD8<32, CD8VF>; + TB, EVEX_CD8<32, CD8VF>; defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt, X86cvtp2UInt, X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, REX_W, - TB, PS, EVEX_CD8<64, CD8VF>; + TB, EVEX_CD8<64, CD8VF>; defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int, X86cvtp2Int, X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, REX_W, @@ -8692,7 +8692,7 @@ defm VCVTDQ2PH : avx512_cvtqq2ps_dq2ph<0x5B, "vcvtdq2ph", any_sint_to_fp, sint_t X86any_VSintToFP, X86VMSintToFP, X86VSintToFpRnd, avx512vl_f16_info, avx512vl_i32_info, SchedWriteCvtDQ2PS, HasFP16>, - T_MAP5, PS, EVEX_CD8<32, CD8VF>; + T_MAP5, EVEX_CD8<32, CD8VF>; defm VCVTUDQ2PH : avx512_cvtqq2ps_dq2ph<0x7A, "vcvtudq2ph", any_uint_to_fp, uint_to_fp, X86any_VUintToFP, X86VMUintToFP, @@ -8703,7 +8703,7 @@ defm VCVTUDQ2PH : avx512_cvtqq2ps_dq2ph<0x7A, "vcvtudq2ph", any_uint_to_fp, uint defm VCVTQQ2PS : avx512_cvtqq2ps_dq2ph<0x5B, "vcvtqq2ps", any_sint_to_fp, sint_to_fp, X86any_VSintToFP, X86VMSintToFP, X86VSintToFpRnd, avx512vl_f32_info, avx512vl_i64_info, - SchedWriteCvtDQ2PS>, REX_W, TB, PS, + SchedWriteCvtDQ2PS>, REX_W, TB, EVEX_CD8<64, CD8VF>; defm VCVTUQQ2PS : avx512_cvtqq2ps_dq2ph<0x7A, "vcvtuqq2ps", any_uint_to_fp, uint_to_fp, @@ -9068,27 +9068,27 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { let Defs = [EFLAGS], Predicates = [HasAVX512] in { defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86any_fcmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, TB, PS, EVEX, VEX_LIG, + "ucomiss", SSEPackedSingle>, TB, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86any_fcmp, f64, f64mem, loadf64, "ucomisd", SSEPackedDouble>, TB, PD, EVEX, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>; defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, X86strict_fcmps, f32, f32mem, loadf32, - "comiss", SSEPackedSingle>, TB, PS, EVEX, VEX_LIG, + "comiss", SSEPackedSingle>, TB, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, X86strict_fcmps, f64, f64mem, loadf64, "comisd", SSEPackedDouble>, TB, PD, EVEX, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>; let isCodeGenOnly = 1 in { defm VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", SSEPackedSingle>, TB, PS, EVEX, VEX_LIG, + sse_load_f32, "ucomiss", SSEPackedSingle>, TB, EVEX, VEX_LIG, 
EVEX_CD8<32, CD8VT1>; defm VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem, sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD, EVEX, VEX_LIG, REX_W, EVEX_CD8<64, CD8VT1>; defm VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", SSEPackedSingle>, TB, PS, EVEX, VEX_LIG, + sse_load_f32, "comiss", SSEPackedSingle>, TB, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem, sse_load_f64, "comisd", SSEPackedDouble>, TB, PD, EVEX, @@ -9104,19 +9104,19 @@ let Defs = [EFLAGS], Predicates = [HasFP16] in { SSEPackedSingle>, AVX512PSIi8Base, T_MAP5, EVEX_CD8<16, CD8VT1>; defm VUCOMISHZ : sse12_ord_cmp<0x2E, FR16X, X86any_fcmp, f16, f16mem, loadf16, - "ucomish", SSEPackedSingle>, T_MAP5, PS, EVEX, + "ucomish", SSEPackedSingle>, T_MAP5, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; defm VCOMISHZ : sse12_ord_cmp<0x2F, FR16X, X86strict_fcmps, f16, f16mem, loadf16, - "comish", SSEPackedSingle>, T_MAP5, PS, EVEX, + "comish", SSEPackedSingle>, T_MAP5, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; let isCodeGenOnly = 1 in { defm VUCOMISHZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v8f16, shmem, sse_load_f16, "ucomish", SSEPackedSingle>, - T_MAP5, PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; + T_MAP5, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; defm VCOMISHZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v8f16, shmem, sse_load_f16, "comish", SSEPackedSingle>, - T_MAP5, PS, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; + T_MAP5, EVEX, VEX_LIG, EVEX_CD8<16, CD8VT1>; } } @@ -9401,18 +9401,18 @@ multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, let Predicates = [HasFP16] in defm PHZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"), sched.PH.ZMM, v32f16_info>, - EVEX_V512, T_MAP5, PS, EVEX_CD8<16, CD8VF>; + EVEX_V512, T_MAP5, EVEX_CD8<16, CD8VF>; let Predicates = [HasFP16, HasVLX] in { defm PHZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"), sched.PH.XMM, v8f16x_info>, - EVEX_V128, T_MAP5, PS, EVEX_CD8<16, CD8VF>; + EVEX_V128, T_MAP5, EVEX_CD8<16, CD8VF>; defm PHZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ph"), sched.PH.YMM, v16f16x_info>, - EVEX_V256, T_MAP5, PS, EVEX_CD8<16, CD8VF>; + EVEX_V256, T_MAP5, EVEX_CD8<16, CD8VF>; } defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), sched.PS.ZMM, v16f32_info>, - EVEX_V512, TB, PS, EVEX_CD8<32, CD8VF>; + EVEX_V512, TB, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), sched.PD.ZMM, v8f64_info>, EVEX_V512, REX_W, TB, PD, EVEX_CD8<64, CD8VF>; @@ -9420,10 +9420,10 @@ multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, let Predicates = [HasVLX] in { defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), sched.PS.XMM, v4f32x_info>, - EVEX_V128, TB, PS, EVEX_CD8<32, CD8VF>; + EVEX_V128, TB, EVEX_CD8<32, CD8VF>; defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), sched.PS.YMM, v8f32x_info>, - EVEX_V256, TB, PS, EVEX_CD8<32, CD8VF>; + EVEX_V256, TB, EVEX_CD8<32, CD8VF>; defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), sched.PD.XMM, v2f64x_info>, EVEX_V128, REX_W, TB, PD, EVEX_CD8<64, CD8VF>; @@ -9439,10 +9439,10 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr, let Predicates = [HasFP16] in defm PHZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ph"), sched.PH.ZMM, v32f16_info>, - EVEX_V512, T_MAP5, PS, EVEX_CD8<16, CD8VF>; + EVEX_V512, T_MAP5, EVEX_CD8<16, CD8VF>; defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), 
sched.PS.ZMM, v16f32_info>, - EVEX_V512, TB, PS, EVEX_CD8<32, CD8VF>; + EVEX_V512, TB, EVEX_CD8<32, CD8VF>; defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), sched.PD.ZMM, v8f64_info>, EVEX_V512, REX_W, TB, PD, EVEX_CD8<64, CD8VF>; @@ -10663,7 +10663,7 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{ - let ExeDomain = DestInfo.ExeDomain in { + let ExeDomain = DestInfo.ExeDomain, ImmT = Imm8 in { defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", @@ -10689,7 +10689,7 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _>: avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, sched, _, _>{ - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, ImmT = Imm8 in defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1", @@ -11501,11 +11501,11 @@ multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_FP>{ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp, SchedWriteFShuffle>, EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>, - AVX512AIi8Base, EVEX, VVVV; + TA, EVEX, VVVV; } -defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_f32_info>, TB, PS; -defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_f64_info>, TB, REX_W; +defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_f32_info>, TB; +defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_f64_info>, TB, PD, REX_W; //===----------------------------------------------------------------------===// // AVX-512 - Byte shift Left/Right @@ -12920,7 +12920,7 @@ multiclass avx512_cvttph2w<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo defm VCVTPH2UW : avx512_cvtph2w<0x7D, "vcvtph2uw", X86cvtp2UInt, X86cvtp2UInt, X86cvtp2UIntRnd, avx512vl_i16_info, avx512vl_f16_info, SchedWriteCvtPD2DQ>, - T_MAP5, PS, EVEX_CD8<16, CD8VF>; + T_MAP5, EVEX_CD8<16, CD8VF>; defm VCVTUW2PH : avx512_cvtph2w<0x7D, "vcvtuw2ph", any_uint_to_fp, uint_to_fp, X86VUintToFpRnd, avx512vl_f16_info, avx512vl_i16_info, SchedWriteCvtPD2DQ>, @@ -12932,7 +12932,7 @@ defm VCVTTPH2W : avx512_cvttph2w<0x7C, "vcvttph2w", X86any_cvttp2si, defm VCVTTPH2UW : avx512_cvttph2w<0x7C, "vcvttph2uw", X86any_cvttp2ui, X86cvttp2ui, X86cvttp2uiSAE, avx512vl_i16_info, avx512vl_f16_info, - SchedWriteCvtPD2DQ>, T_MAP5, PS, EVEX_CD8<16, CD8VF>; + SchedWriteCvtPD2DQ>, T_MAP5, EVEX_CD8<16, CD8VF>; defm VCVTPH2W : avx512_cvtph2w<0x7D, "vcvtph2w", X86cvtp2Int, X86cvtp2Int, X86cvtp2IntRnd, avx512vl_i16_info, avx512vl_f16_info, SchedWriteCvtPD2DQ>, @@ -12983,7 +12983,7 @@ defm VCVTPH2DQ : avx512_cvtph2dq<0x5B, "vcvtph2dq", X86cvtp2Int, X86cvtp2Int, X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, T_MAP5, PD, EVEX_CD8<16, CD8VH>; defm VCVTPH2UDQ : avx512_cvtph2dq<0x79, "vcvtph2udq", X86cvtp2UInt, X86cvtp2UInt, - X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5, PS, + X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, T_MAP5, EVEX_CD8<16, CD8VH>; defm VCVTTPH2DQ : avx512_cvttph2dq<0x5B, "vcvttph2dq", X86any_cvttp2si, @@ -12993,7 +12993,7 @@ defm VCVTTPH2DQ : avx512_cvttph2dq<0x5B, "vcvttph2dq", X86any_cvttp2si, defm VCVTTPH2UDQ : avx512_cvttph2dq<0x78, "vcvttph2udq", X86any_cvttp2ui, X86cvttp2ui, X86cvttp2uiSAE, - 
SchedWriteCvtPS2DQ>, T_MAP5, PS, + SchedWriteCvtPS2DQ>, T_MAP5, EVEX_CD8<16, CD8VH>; // Convert Half to Signed/Unsigned Quardword @@ -13154,7 +13154,7 @@ multiclass avx512_cvtqq2ph<bits<8> opc, string OpcodeStr, SDPatternOperator OpNo } defm VCVTQQ2PH : avx512_cvtqq2ph<0x5B, "vcvtqq2ph", any_sint_to_fp, sint_to_fp, - X86VSintToFpRnd, SchedWriteCvtDQ2PS>, REX_W, T_MAP5, PS, + X86VSintToFpRnd, SchedWriteCvtDQ2PS>, REX_W, T_MAP5, EVEX_CD8<64, CD8VF>; defm VCVTUQQ2PH : avx512_cvtqq2ph<0x7A, "vcvtuqq2ph", any_uint_to_fp, uint_to_fp, diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 4fb05231010d8..abd0d87354f8e 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1117,8 +1117,8 @@ let Predicates = [HasBMI, HasEGPR, In64BitMode] in { // Complexity is reduced to give and with immediate a chance to match first. let Defs = [EFLAGS], AddedComplexity = -6 in { - defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8, PS; - defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8, PS, REX_W; + defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8; + defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8, REX_W; } let Predicates = [HasBMI], AddedComplexity = -6 in { diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td index dd63e921b8acd..6a9a74ce15f2a 100644 --- a/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/llvm/lib/Target/X86/X86InstrFPStack.td @@ -666,20 +666,20 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>; let Uses = [FPSW, FPCW] in { def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst), - "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB, PS, + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB, Requires<[HasFXSR]>; def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst), "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>, - TB, PS, Requires<[HasFXSR, In64BitMode]>; + TB, Requires<[HasFXSR, In64BitMode]>; } // Uses = [FPSW, FPCW] let Defs = [FPSW, FPCW] in { def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src), "fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>, - TB, PS, Requires<[HasFXSR]>; + TB, Requires<[HasFXSR]>; def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src), "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>, - TB, PS, Requires<[HasFXSR, In64BitMode]>; + TB, Requires<[HasFXSR, In64BitMode]>; } // Defs = [FPSW, FPCW] } // SchedRW diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td index df05a5788a50a..f94072a0c7076 100644 --- a/llvm/lib/Target/X86/X86InstrFormats.td +++ b/llvm/lib/Target/X86/X86InstrFormats.td @@ -234,7 +234,9 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, // based on address size of the mode? bits<2> AdSizeBits = AdSize.Value; - Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have? + Encoding OpEnc = EncNormal; // Encoding used by this instruction + // Which prefix byte does this inst have? + Prefix OpPrefix = !if(!eq(OpEnc, EncNormal), NoPrfx, PS); bits<3> OpPrefixBits = OpPrefix.Value; Map OpMap = OB; // Which opcode map does this inst have? bits<4> OpMapBits = OpMap.Value; @@ -243,7 +245,6 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? Domain ExeDomain = d; bit hasREPPrefix = 0; // Does this inst have a REP prefix? 
- Encoding OpEnc = EncNormal; // Encoding used by this instruction bits<2> OpEncBits = OpEnc.Value; bit IgnoresW = 0; // Does this inst ignore REX_W field? bit EVEX_W1_VEX_W0 = 0; // This EVEX inst with VEX.W==1 can become a VEX diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td index 8d472ccd52df3..8d6bc8d0ee2cf 100644 --- a/llvm/lib/Target/X86/X86InstrMMX.td +++ b/llvm/lib/Target/X86/X86InstrMMX.td @@ -487,13 +487,13 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem, // -- Conversion Instructions defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi, f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}", - WriteCvtPS2I, SSEPackedSingle>, TB, PS, SIMD_EXC; + WriteCvtPS2I, SSEPackedSingle>, TB, SIMD_EXC; defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi, f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}", WriteCvtPD2I, SSEPackedDouble>, TB, PD, SIMD_EXC; defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi, f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}", - WriteCvtPS2I, SSEPackedSingle>, TB, PS, SIMD_EXC; + WriteCvtPS2I, SSEPackedSingle>, TB, SIMD_EXC; defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi, f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}", WriteCvtPD2I, SSEPackedDouble>, TB, PD, SIMD_EXC; @@ -504,7 +504,7 @@ let Constraints = "$src1 = $dst" in { defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128, int_x86_sse_cvtpi2ps, i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}", - SSEPackedSingle>, TB, PS, SIMD_EXC; + SSEPackedSingle>, TB, SIMD_EXC; } // Extract / Insert diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index 779f27085eae0..305bd74f7bd70 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -165,10 +165,10 @@ def POPP64r : I<0x58, AddRegFrm, (outs GR64:$reg), (ins), "popp\t$reg", []>, REX_W, ExplicitREX2Prefix, Requires<[In64BitMode]>; def POP2: I<0x8F, MRM0r, (outs GR64:$reg1, GR64:$reg2), (ins), "pop2\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX, VVVV, EVEX_B, T_MAP4, PS; + []>, EVEX, VVVV, EVEX_B, T_MAP4; def POP2P: I<0x8F, MRM0r, (outs GR64:$reg1, GR64:$reg2), (ins), "pop2p\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX, VVVV, EVEX_B, T_MAP4, PS, REX_W; + []>, EVEX, VVVV, EVEX_B, T_MAP4, REX_W; } // mayLoad, SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in @@ -186,10 +186,10 @@ def PUSHP64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "pushp\t$reg", []>, REX_W, ExplicitREX2Prefix, Requires<[In64BitMode]>; def PUSH2: I<0xFF, MRM6r, (outs), (ins GR64:$reg1, GR64:$reg2), "push2\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX, VVVV, EVEX_B, T_MAP4, PS; + []>, EVEX, VVVV, EVEX_B, T_MAP4; def PUSH2P: I<0xFF, MRM6r, (outs), (ins GR64:$reg1, GR64:$reg2), "push2p\t{$reg2, $reg1|$reg1, $reg2}", - []>, EVEX, VVVV, EVEX_B, T_MAP4, PS, REX_W; + []>, EVEX, VVVV, EVEX_B, T_MAP4, REX_W; } // mayStore, SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteCopy] in { def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>, @@ -251,52 +251,52 @@ let Defs = [EFLAGS] in { def BSF16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsf{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))]>, - TB, PS, OpSize16, Sched<[WriteBSF]>; + TB, OpSize16, Sched<[WriteBSF]>; def BSF16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsf{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 
addr:$src)))]>, - TB, PS, OpSize16, Sched<[WriteBSFLd]>; + TB, OpSize16, Sched<[WriteBSFLd]>; def BSF32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsf{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))]>, - TB, PS, OpSize32, Sched<[WriteBSF]>; + TB, OpSize32, Sched<[WriteBSF]>; def BSF32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsf{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))]>, - TB, PS, OpSize32, Sched<[WriteBSFLd]>; + TB, OpSize32, Sched<[WriteBSFLd]>; def BSF64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsf{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))]>, - TB, PS, Sched<[WriteBSF]>; + TB, Sched<[WriteBSF]>; def BSF64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsf{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))]>, - TB, PS, Sched<[WriteBSFLd]>; + TB, Sched<[WriteBSFLd]>; def BSR16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "bsr{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))]>, - TB, PS, OpSize16, Sched<[WriteBSR]>; + TB, OpSize16, Sched<[WriteBSR]>; def BSR16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "bsr{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))]>, - TB, PS, OpSize16, Sched<[WriteBSRLd]>; + TB, OpSize16, Sched<[WriteBSRLd]>; def BSR32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "bsr{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))]>, - TB, PS, OpSize32, Sched<[WriteBSR]>; + TB, OpSize32, Sched<[WriteBSR]>; def BSR32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "bsr{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))]>, - TB, PS, OpSize32, Sched<[WriteBSRLd]>; + TB, OpSize32, Sched<[WriteBSRLd]>; def BSR64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "bsr{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))]>, - TB, PS, Sched<[WriteBSR]>; + TB, Sched<[WriteBSR]>; def BSR64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "bsr{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))]>, - TB, PS, Sched<[WriteBSRLd]>; + TB, Sched<[WriteBSRLd]>; } // Defs = [EFLAGS] let SchedRW = [WriteMicrocoded] in { @@ -1095,29 +1095,29 @@ let Predicates = [HasMOVBE] in { def MOVBE16rm : I<0xF0, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "movbe{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (bswap (loadi16 addr:$src)))]>, - OpSize16, T8, PS; + OpSize16, T8; def MOVBE32rm : I<0xF0, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "movbe{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bswap (loadi32 addr:$src)))]>, - OpSize32, T8, PS; + OpSize32, T8; def MOVBE64rm : RI<0xF0, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "movbe{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bswap (loadi64 addr:$src)))]>, - T8, PS; + T8; } let SchedRW = [WriteStore] in { def MOVBE16mr : I<0xF1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src), "movbe{w}\t{$src, $dst|$dst, $src}", [(store (bswap GR16:$src), addr:$dst)]>, - OpSize16, T8, PS; + OpSize16, T8; def MOVBE32mr : I<0xF1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movbe{l}\t{$src, $dst|$dst, $src}", [(store (bswap GR32:$src), addr:$dst)]>, - OpSize32, T8, PS; + OpSize32, T8; def MOVBE64mr : RI<0xF1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movbe{q}\t{$src, $dst|$dst, $src}", 
[(store (bswap GR64:$src), addr:$dst)]>, - T8, PS; + T8; } } @@ -1127,13 +1127,13 @@ let Predicates = [HasMOVBE] in { let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in { def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins), "rdrand{w}\t$dst", [(set GR16:$dst, EFLAGS, (X86rdrand))]>, - OpSize16, TB, PS; + OpSize16, TB; def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), "rdrand{l}\t$dst", [(set GR32:$dst, EFLAGS, (X86rdrand))]>, - OpSize32, TB, PS; + OpSize32, TB; def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), "rdrand{q}\t$dst", [(set GR64:$dst, EFLAGS, (X86rdrand))]>, - TB, PS; + TB; } //===----------------------------------------------------------------------===// @@ -1141,11 +1141,11 @@ let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in { // let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in { def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), "rdseed{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB, PS; + [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB; def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), "rdseed{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB, PS; + [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB; def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdseed{q}\t$dst", - [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB, PS; + [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB; } //===----------------------------------------------------------------------===// @@ -1218,11 +1218,11 @@ multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, let hasSideEffects = 0 in { def rr#Suffix : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8, PS, VEX, VVVV, Sched<[sched]>; + T8, VEX, VVVV, Sched<[sched]>; let mayLoad = 1 in def rm#Suffix : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), []>, - T8, PS, VEX, VVVV, Sched<[sched.Folded]>; + T8, VEX, VVVV, Sched<[sched.Folded]>; } } @@ -1288,12 +1288,12 @@ multiclass bmi4VOp3_base<bits<8> opc, string mnemonic, RegisterClass RC, def rr#Suffix : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpNode RC:$src1, RC:$src2)), (implicit EFLAGS)]>, - T8, PS, VEX, Sched<[Sched]>; + T8, VEX, Sched<[Sched]>; let mayLoad = 1 in def rm#Suffix : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (OpNode (ld_frag addr:$src1), RC:$src2)), - (implicit EFLAGS)]>, T8, PS, VEX, + (implicit EFLAGS)]>, T8, VEX, Sched<[Sched.Folded, // x86memop:$src1 ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -1497,19 +1497,19 @@ let SchedRW = [WriteStore] in { def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore32 addr:$dst, GR32:$src)]>, - T8, PS, Requires<[HasMOVDIRI, NoEGPR]>; + T8, Requires<[HasMOVDIRI, NoEGPR]>; def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore64 addr:$dst, GR64:$src)]>, - T8, PS, Requires<[In64BitMode, HasMOVDIRI, NoEGPR]>; + T8, Requires<[In64BitMode, HasMOVDIRI, NoEGPR]>; def MOVDIRI32_EVEX : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore32 addr:$dst, 
GR32:$src)]>, - EVEX, NoCD8, T_MAP4, PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; + EVEX, NoCD8, T_MAP4, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; def MOVDIRI64_EVEX : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movdiri\t{$src, $dst|$dst, $src}", [(int_x86_directstore64 addr:$dst, GR64:$src)]>, - EVEX, NoCD8, T_MAP4, PS, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; + EVEX, NoCD8, T_MAP4, Requires<[In64BitMode, HasMOVDIRI, HasEGPR]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1588,11 +1588,11 @@ let SchedRW = [WriteSystem] in { let Uses = [EAX, EDX] in def INVLPGB32 : I<0x01, MRM_FE, (outs), (ins), "invlpgb", []>, - TB, PS, Requires<[Not64BitMode]>; + TB, Requires<[Not64BitMode]>; let Uses = [RAX, EDX] in def INVLPGB64 : I<0x01, MRM_FE, (outs), (ins), "invlpgb", []>, - TB, PS, Requires<[In64BitMode]>; + TB, Requires<[In64BitMode]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1602,7 +1602,7 @@ let SchedRW = [WriteSystem] in { let SchedRW = [WriteSystem] in { def TLBSYNC : I<0x01, MRM_FF, (outs), (ins), "tlbsync", []>, - TB, PS, Requires<[]>; + TB, Requires<[]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1617,7 +1617,7 @@ let Uses = [EAX], SchedRW = [WriteSystem] in // let SchedRW = [WriteSystem] in def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize", - [(int_x86_serialize)]>, TB, PS, + [(int_x86_serialize)]>, TB, Requires<[HasSERIALIZE]>; //===----------------------------------------------------------------------===// @@ -1711,4 +1711,4 @@ def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src", - [(int_x86_cldemote addr:$src)]>, TB, PS; + [(int_x86_cldemote addr:$src)]>, TB; diff --git a/llvm/lib/Target/X86/X86InstrRAOINT.td b/llvm/lib/Target/X86/X86InstrRAOINT.td index 601355d4f7de4..bc17b00f3573a 100644 --- a/llvm/lib/Target/X86/X86InstrRAOINT.td +++ b/llvm/lib/Target/X86/X86InstrRAOINT.td @@ -39,7 +39,7 @@ multiclass RAOINT_BASE<string OpcodeStr> { Sched<[WriteALURMW]>, REX_W; } -defm AADD : RAOINT_BASE<"add">, T8, PS; +defm AADD : RAOINT_BASE<"add">, T8; defm AAND : RAOINT_BASE<"and">, T8, PD; defm AOR : RAOINT_BASE<"or" >, T8, XD; defm AXOR : RAOINT_BASE<"xor">, T8, XS; diff --git a/llvm/lib/Target/X86/X86InstrSGX.td b/llvm/lib/Target/X86/X86InstrSGX.td index 3c8d6e3c6b6b3..747f5aa86653d 100644 --- a/llvm/lib/Target/X86/X86InstrSGX.td +++ b/llvm/lib/Target/X86/X86InstrSGX.td @@ -17,13 +17,13 @@ let SchedRW = [WriteSystem], Predicates = [HasSGX] in { // ENCLS - Execute an Enclave System Function of Specified Leaf Number def ENCLS : I<0x01, MRM_CF, (outs), (ins), - "encls", []>, TB, PS; + "encls", []>, TB; // ENCLU - Execute an Enclave User Function of Specified Leaf Number def ENCLU : I<0x01, MRM_D7, (outs), (ins), - "enclu", []>, TB, PS; + "enclu", []>, TB; // ENCLV - Execute an Enclave VMM Function of Specified Leaf Number def ENCLV : I<0x01, MRM_C0, (outs), (ins), - "enclv", []>, TB, PS; + "enclv", []>, TB; } // SchedRW diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 27d3974a674ab..df1f0b5b4ca72 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -352,26 +352,26 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in let Predicates = [HasAVX, NoVLX] in { defm VMOVAPS : 
sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SchedWriteFMoveLS.XMM>, - TB, PS, VEX, WIG; + TB, VEX, WIG; defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd", SSEPackedDouble, SchedWriteFMoveLS.XMM>, TB, PD, VEX, WIG; defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SchedWriteFMoveLS.XMM>, - TB, PS, VEX, WIG; + TB, VEX, WIG; defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64, "movupd", SSEPackedDouble, SchedWriteFMoveLS.XMM>, TB, PD, VEX, WIG; defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32, "movaps", SSEPackedSingle, SchedWriteFMoveLS.YMM>, - TB, PS, VEX, VEX_L, WIG; + TB, VEX, VEX_L, WIG; defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64, "movapd", SSEPackedDouble, SchedWriteFMoveLS.YMM>, TB, PD, VEX, VEX_L, WIG; defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32, "movups", SSEPackedSingle, SchedWriteFMoveLS.YMM>, - TB, PS, VEX, VEX_L, WIG; + TB, VEX, VEX_L, WIG; defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", SSEPackedDouble, SchedWriteFMoveLS.YMM>, TB, PD, VEX, VEX_L, WIG; @@ -380,10 +380,10 @@ defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64, "movupd", let Predicates = [UseSSE1] in { defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32, "movaps", SSEPackedSingle, SchedWriteFMoveLS.XMM>, - TB, PS; + TB; defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32, "movups", SSEPackedSingle, SchedWriteFMoveLS.XMM>, - TB, PS; + TB; } let Predicates = [UseSSE2] in { defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64, "movapd", @@ -666,7 +666,7 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDPatternOperator pdnode, def PSrm : PI<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2), !strconcat(base_opc, "s", asm_opr), - [], SSEPackedSingle>, TB, PS, + [], SSEPackedSingle>, TB, Sched<[SchedWriteFShuffle.XMM.Folded, SchedWriteFShuffle.XMM.ReadAfterFold]>; def PDrm : PI<opc, MRMSrcMem, @@ -1233,16 +1233,16 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si, defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PS>, - TB, PS, VEX, Requires<[HasAVX, NoVLX]>, WIG; + TB, VEX, Requires<[HasAVX, NoVLX]>, WIG; defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, load, "vcvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PSY>, - TB, PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, WIG; + TB, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, WIG; defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop, "cvtdq2ps\t{$src, $dst|$dst, $src}", SSEPackedSingle, WriteCvtI2PS>, - TB, PS, Requires<[UseSSE2]>; + TB, Requires<[UseSSE2]>; } // AVX aliases @@ -1699,30 +1699,30 @@ let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in { def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>, - TB, PS, VEX, Sched<[WriteCvtPS2PD]>, WIG; + TB, VEX, Sched<[WriteCvtPS2PD]>, WIG; def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, - TB, PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, WIG; + TB, VEX, Sched<[WriteCvtPS2PD.Folded]>, WIG; def VCVTPS2PDYrr : I<0x5A, 
MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>, - TB, PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, WIG; + TB, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, WIG; def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src), "vcvtps2pd\t{$src, $dst|$dst, $src}", [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))]>, - TB, PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, WIG; + TB, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, WIG; } let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in { def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>, - TB, PS, Sched<[WriteCvtPS2PD]>; + TB, Sched<[WriteCvtPS2PD]>; def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src), "cvtps2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))]>, - TB, PS, Sched<[WriteCvtPS2PD.Folded]>; + TB, Sched<[WriteCvtPS2PD.Folded]>; } // Convert Packed DW Integers to Packed Double FP @@ -1919,42 +1919,42 @@ let mayLoad = 1 in let Defs = [EFLAGS] in { defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, TB, PS, VEX, VEX_LIG, WIG; + "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG; defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64, "ucomisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32, - "comiss", SSEPackedSingle>, TB, PS, VEX, VEX_LIG, WIG; + "comiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG; defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64, "comisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; let isCodeGenOnly = 1 in { defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", SSEPackedSingle>, TB, PS, VEX, VEX_LIG, WIG; + sse_load_f32, "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG; defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", SSEPackedSingle>, TB, PS, VEX, VEX_LIG, WIG; + sse_load_f32, "comiss", SSEPackedSingle>, TB, VEX, VEX_LIG, WIG; defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, sse_load_f64, "comisd", SSEPackedDouble>, TB, PD, VEX, VEX_LIG, WIG; } defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32, - "ucomiss", SSEPackedSingle>, TB, PS; + "ucomiss", SSEPackedSingle>, TB; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64, "ucomisd", SSEPackedDouble>, TB, PD; defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32, - "comiss", SSEPackedSingle>, TB, PS; + "comiss", SSEPackedSingle>, TB; defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64, "comisd", SSEPackedDouble>, TB, PD; let isCodeGenOnly = 1 in { defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", SSEPackedSingle>, TB, PS; + sse_load_f32, "ucomiss", SSEPackedSingle>, TB; defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, sse_load_f64, "ucomisd", SSEPackedDouble>, TB, PD; defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", SSEPackedSingle>, TB, PS; 
+ sse_load_f32, "comiss", SSEPackedSingle>, TB; defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, sse_load_f64, "comisd", SSEPackedDouble>, TB, PD; } @@ -1979,20 +1979,20 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, TB, PS, VEX, VVVV, WIG; + SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, loadv4f32>, TB, VEX, VVVV, WIG; defm VCMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, loadv2f64>, TB, PD, VEX, VVVV, WIG; defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, v8f32, "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, TB, PS, VEX, VVVV, VEX_L, WIG; + SchedWriteFCmpSizes.PS.YMM, SSEPackedSingle, loadv8f32>, TB, VEX, VVVV, VEX_L, WIG; defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, v4f64, "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", SchedWriteFCmpSizes.PD.YMM, SSEPackedDouble, loadv4f64>, TB, PD, VEX, VVVV, VEX_L, WIG; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, v4f32, "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, TB, PS; + SchedWriteFCmpSizes.PS.XMM, SSEPackedSingle, memopv4f32>, TB; defm CMPPD : sse12_cmp_packed<VR128, f128mem, v2f64, "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", SchedWriteFCmpSizes.PD.XMM, SSEPackedDouble, memopv2f64>, TB, PD; @@ -2076,11 +2076,11 @@ let Predicates = [HasAVX, NoVLX] in { defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, - TB, PS, VEX, VVVV, WIG; + TB, VEX, VVVV, WIG; defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv8f32, SchedWriteFShuffle.YMM, SSEPackedSingle>, - TB, PS, VEX, VVVV, VEX_L, WIG; + TB, VEX, VVVV, VEX_L, WIG; defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble>, @@ -2093,7 +2093,7 @@ let Predicates = [HasAVX, NoVLX] in { let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, PS; + memopv4f32, SchedWriteFShuffle.XMM, SSEPackedSingle>, TB; defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", memopv2f64, SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD; @@ -2126,26 +2126,26 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, let Predicates = [HasAVX, NoVLX] in { defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, load, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, PS, VEX, VVVV, WIG; + SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, VEX, VVVV, WIG; defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, load, VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD, VEX, VVVV, WIG; defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, load, VR128, f128mem, 
"unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, PS, VEX, VVVV, WIG; + SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, VEX, VVVV, WIG; defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, load, VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble>, TB, PD, VEX, VVVV, WIG; defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, load, VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, PS, VEX, VVVV, VEX_L, WIG; + SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, VEX, VVVV, VEX_L, WIG; defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, load, VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.YMM, SSEPackedDouble>, TB, PD, VEX, VVVV, VEX_L, WIG; defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, load, VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, PS, VEX, VVVV, VEX_L, WIG; + SchedWriteFShuffle.YMM, SSEPackedSingle>, TB, VEX, VVVV, VEX_L, WIG; defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SchedWriteFShuffle.YMM, SSEPackedDouble>, TB, PD, VEX, VVVV, VEX_L, WIG; @@ -2154,13 +2154,13 @@ defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, load, let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memop, VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", - SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, PS; + SchedWriteFShuffle.XMM, SSEPackedSingle>, TB; defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memop, VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, TB, PD; defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memop, VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", - SchedWriteFShuffle.XMM, SSEPackedSingle>, TB, PS; + SchedWriteFShuffle.XMM, SSEPackedSingle>, TB; defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memop, VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", SchedWriteFShuffle.XMM, SSEPackedDouble>, TB, PD; @@ -2208,11 +2208,11 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt, let Predicates = [HasAVX] in { defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", - SSEPackedSingle>, TB, PS, VEX, WIG; + SSEPackedSingle>, TB, VEX, WIG; defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", SSEPackedDouble>, TB, PD, VEX, WIG; defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps", - SSEPackedSingle>, TB, PS, VEX, VEX_L, WIG; + SSEPackedSingle>, TB, VEX, VEX_L, WIG; defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd", SSEPackedDouble>, TB, PD, VEX, VEX_L, WIG; @@ -2228,7 +2228,7 @@ let Predicates = [HasAVX] in { } defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps", - SSEPackedSingle>, TB, PS; + SSEPackedSingle>, TB; defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd", SSEPackedDouble>, TB, PD; @@ -2312,7 +2312,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f256mem, sched.YMM, - [], [], 0>, TB, PS, VEX, VVVV, VEX_L, WIG; + [], [], 0>, TB, VEX, VVVV, VEX_L, WIG; defm V#NAME#PDY : 
sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f256mem, sched.YMM, @@ -2320,7 +2320,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, - [], [], 0>, TB, PS, VEX, VVVV, WIG; + [], [], 0>, TB, VEX, VVVV, WIG; defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, @@ -2330,7 +2330,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, sched.XMM, - [], []>, TB, PS; + [], []>, TB; defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, sched.XMM, @@ -2636,14 +2636,14 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in { let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, - SSEPackedSingle, sched.PS.XMM, 0>, TB, PS, VEX, VVVV, WIG; + SSEPackedSingle, sched.PS.XMM, 0>, TB, VEX, VVVV, WIG; defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, loadv2f64, SSEPackedDouble, sched.PD.XMM, 0>, TB, PD, VEX, VVVV, WIG; defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, v8f32, f256mem, loadv8f32, - SSEPackedSingle, sched.PS.YMM, 0>, TB, PS, VEX, VVVV, VEX_L, WIG; + SSEPackedSingle, sched.PS.YMM, 0>, TB, VEX, VVVV, VEX_L, WIG; defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, sched.PD.YMM, 0>, TB, PD, VEX, VVVV, VEX_L, WIG; @@ -2652,7 +2652,7 @@ let Uses = [MXCSR], mayRaiseFPException = 1 in { let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, memopv4f32, SSEPackedSingle, - sched.PS.XMM>, TB, PS; + sched.PS.XMM>, TB; defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128, v2f64, f128mem, memopv2f64, SSEPackedDouble, sched.PD.XMM>, TB, PD; @@ -3165,11 +3165,11 @@ let SchedRW = [WriteStoreNT] in { def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "movnti{l}\t{$src, $dst|$dst, $src}", [(nontemporalstore (i32 GR32:$src), addr:$dst)]>, - TB, PS, Requires<[HasSSE2]>; + TB, Requires<[HasSSE2]>; def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "movnti{q}\t{$src, $dst|$dst, $src}", [(nontemporalstore (i64 GR64:$src), addr:$dst)]>, - TB, PS, Requires<[HasSSE2]>; + TB, Requires<[HasSSE2]>; } // SchedRW = [WriteStoreNT] let Predicates = [HasAVX, NoVLX] in { @@ -3226,7 +3226,7 @@ let SchedRW = [WriteLoad] in { // Flush cache def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src), "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>, - TB, PS, Requires<[HasCLFLUSH]>; + TB, Requires<[HasCLFLUSH]>; } let SchedRW = [WriteNop] in { @@ -3241,11 +3241,11 @@ let SchedRW = [WriteFence] in { // TODO: As with mfence, we may want to ease the availability of sfence/lfence // to include any 64-bit target. 
def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>, - TB, PS, Requires<[HasSSE1]>; + TB, Requires<[HasSSE1]>; def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>, - TB, PS, Requires<[HasSSE2]>; + TB, Requires<[HasSSE2]>; def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>, - TB, PS, Requires<[HasMFence]>; + TB, Requires<[HasMFence]>; } // SchedRW def : Pat<(X86MFence), (MFENCE)>; @@ -3266,11 +3266,11 @@ def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst), let mayLoad=1, hasSideEffects=1, Defs=[MXCSR] in def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src), "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>, - TB, PS, Sched<[WriteLDMXCSR]>; + TB, Sched<[WriteLDMXCSR]>; let mayStore=1, hasSideEffects=1, Uses=[MXCSR] in def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst), "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>, - TB, PS, Sched<[WriteSTMXCSR]>; + TB, Sched<[WriteSTMXCSR]>; //===---------------------------------------------------------------------===// // SSE2 - Move Aligned/Unaligned Packed Integer Instructions @@ -6715,7 +6715,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, [!if(UsesXMM0, (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, - T8, PS, Sched<[sched]>; + T8, Sched<[sched]>; def rm#Suffix : I<Opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), @@ -6726,7 +6726,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, (set VR128:$dst, (IntId VR128:$src1, (memop addr:$src2), XMM0)), (set VR128:$dst, (IntId VR128:$src1, - (memop addr:$src2))))]>, T8, PS, + (memop addr:$src2))))]>, T8, Sched<[sched.Folded, sched.ReadAfterFold]>; } @@ -6736,7 +6736,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, NoEGPR] in { "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, - (i8 timm:$src3)))]>, TA, PS, + (i8 timm:$src3)))]>, TA, Sched<[SchedWriteVecIMul.XMM]>; def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), @@ -6744,7 +6744,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, NoEGPR] in { [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, (memop addr:$src2), - (i8 timm:$src3)))]>, TA, PS, + (i8 timm:$src3)))]>, TA, Sched<[SchedWriteVecIMul.XMM.Folded, SchedWriteVecIMul.XMM.ReadAfterFold]>; @@ -6772,7 +6772,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, HasEGPR, In64BitMode] in [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, (i8 timm:$src3)))]>, - EVEX, NoCD8, T_MAP4, PS, Sched<[SchedWriteVecIMul.XMM]>; + EVEX, NoCD8, T_MAP4, Sched<[SchedWriteVecIMul.XMM]>; def SHA1RNDS4rmi_EVEX: Ii8<0xD4, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", @@ -6780,7 +6780,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA, HasEGPR, In64BitMode] in (int_x86_sha1rnds4 VR128:$src1, (memop addr:$src2), (i8 timm:$src3)))]>, - EVEX, NoCD8, T_MAP4, PS, + EVEX, NoCD8, T_MAP4, Sched<[SchedWriteVecIMul.XMM.Folded, SchedWriteVecIMul.XMM.ReadAfterFold]>; @@ -7474,12 +7474,12 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { // Zero All YMM registers def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", - [(int_x86_avx_vzeroall)]>, TB, PS, VEX, VEX_L, + 
[(int_x86_avx_vzeroall)]>, TB, VEX, VEX_L, Requires<[HasAVX]>, WIG; // Zero Upper bits of YMM registers def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", - [(int_x86_avx_vzeroupper)]>, TB, PS, VEX, + [(int_x86_avx_vzeroupper)]>, TB, VEX, Requires<[HasAVX]>, WIG; } // Defs } // SchedRW @@ -8240,10 +8240,10 @@ let Predicates = [HasAVXVNNIINT8] in { 1>, VEX_L, T8, XD; defm VPDPBUUD : avx_dotprod_rm<0x50,"vpdpbuud", v4i32, VR128, loadv4i32, i128mem, X86vpdpbuud, SchedWriteVecIMul.XMM, - 1>, T8, PS; + 1>, T8; defm VPDPBUUDY : avx_dotprod_rm<0x50,"vpdpbuud", v8i32, VR256, loadv8i32, i256mem, X86vpdpbuud, SchedWriteVecIMul.YMM, - 1>, VEX_L, T8, PS; + 1>, VEX_L, T8; defm VPDPBSSDS : avx_dotprod_rm<0x51,"vpdpbssds", v4i32, VR128, loadv4i32, i128mem, X86vpdpbssds, SchedWriteVecIMul.XMM, 1>, T8, XD; @@ -8252,10 +8252,10 @@ let Predicates = [HasAVXVNNIINT8] in { 1>, VEX_L, T8, XD; defm VPDPBUUDS : avx_dotprod_rm<0x51,"vpdpbuuds", v4i32, VR128, loadv4i32, i128mem, X86vpdpbuuds, SchedWriteVecIMul.XMM, - 1>, T8, PS; + 1>, T8; defm VPDPBUUDSY : avx_dotprod_rm<0x51,"vpdpbuuds", v8i32, VR256, loadv8i32, i256mem, X86vpdpbuuds, SchedWriteVecIMul.YMM, - 1>, VEX_L, T8, PS; + 1>, VEX_L, T8; defm VPDPBSUD : avx_dotprod_rm<0x50,"vpdpbsud", v4i32, VR128, loadv4i32, i128mem, X86vpdpbsud, SchedWriteVecIMul.XMM, 0>, T8, XS; @@ -8316,7 +8316,7 @@ let Predicates = [HasAVXNECONVERT] in { defm VCVTNEOBF162PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneobf162ps", f128mem, f256mem>, T8, XD; defm VCVTNEOPH2PS : AVX_NE_CONVERT_BASE<0xb0, "vcvtneoph2ps", f128mem, - f256mem>, T8, PS; + f256mem>, T8; defm VCVTNEPS2BF16 : VCVTNEPS2BF16_BASE, VEX, T8, XS, ExplicitVEXPrefix; def : Pat<(v8bf16 (X86vfpround (v8f32 VR256:$src))), @@ -8389,7 +8389,7 @@ let Predicates = [HasSM3], Constraints = "$src1 = $dst" in { } } -defm VSM3MSG1 : SM3_Base<"vsm3msg1">, T8, PS; +defm VSM3MSG1 : SM3_Base<"vsm3msg1">, T8; defm VSM3MSG2 : SM3_Base<"vsm3msg2">, T8, PD; defm VSM3RNDS2 : VSM3RNDS2_Base, VEX, VVVV, TA, PD; @@ -8458,5 +8458,5 @@ defm VPDPWSUD : avx_vnni_int16<0xd2, "vpdpwsud", 0>, T8, XS; defm VPDPWSUDS : avx_vnni_int16<0xd3, "vpdpwsuds", 0>, T8, XS; defm VPDPWUSD : avx_vnni_int16<0xd2, "vpdpwusd", 0>, T8, PD; defm VPDPWUSDS : avx_vnni_int16<0xd3, "vpdpwusds", 0>, T8, PD; -defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8, PS; -defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8, PS; +defm VPDPWUUD : avx_vnni_int16<0xd2, "vpdpwuud", 1>, T8; +defm VPDPWUUDS : avx_vnni_int16<0xd3, "vpdpwuuds", 1>, T8; diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index 4471071e8f9a9..efb58c6102dd1 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -426,11 +426,11 @@ let SchedRW = [WriteSystem] in { let Uses = [EAX, ECX, EDX] in def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB; let Uses = [EAX, ECX, EDX] in -def WRMSRNS : I<0x01, MRM_C6, (outs), (ins), "wrmsrns", []>, TB, PS; +def WRMSRNS : I<0x01, MRM_C6, (outs), (ins), "wrmsrns", []>, TB; let Defs = [EAX, EDX], Uses = [ECX] in def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB; let Defs = [RAX, EFLAGS], Uses = [RBX, RCX], Predicates = [In64BitMode] in -def PBNDKB : I<0x01, MRM_C7, (outs), (ins), "pbndkb", []>, TB, PS; +def PBNDKB : I<0x01, MRM_C7, (outs), (ins), "pbndkb", []>, TB; let Uses = [RSI, RDI, RCX], Predicates = [In64BitMode] in { def WRMSRLIST : I<0x01, MRM_C6, (outs), (ins), "wrmsrlist", []>, TB, XS; def RDMSRLIST : I<0x01, MRM_C6, (outs), (ins), "rdmsrlist", []>, TB, XD; @@ 
-523,10 +523,10 @@ let SchedRW = [WriteSystem] in { let Predicates = [NoEGPR] in { def WRSSD : I<0xF6, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "wrssd\t{$src, $dst|$dst, $src}", - [(int_x86_wrssd GR32:$src, addr:$dst)]>, T8, PS; + [(int_x86_wrssd GR32:$src, addr:$dst)]>, T8; def WRSSQ : RI<0xF6, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "wrssq\t{$src, $dst|$dst, $src}", - [(int_x86_wrssq GR64:$src, addr:$dst)]>, T8, PS; + [(int_x86_wrssq GR64:$src, addr:$dst)]>, T8; def WRUSSD : I<0xF5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "wrussd\t{$src, $dst|$dst, $src}", [(int_x86_wrussd GR32:$src, addr:$dst)]>, T8, PD; @@ -538,10 +538,10 @@ let Predicates = [NoEGPR] in { let Predicates = [HasEGPR, In64BitMode] in { def WRSSD_EVEX : I<0x66, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "wrssd\t{$src, $dst|$dst, $src}", - [(int_x86_wrssd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4, PS; + [(int_x86_wrssd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4; def WRSSQ_EVEX : RI<0x66, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), "wrssq\t{$src, $dst|$dst, $src}", - [(int_x86_wrssq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4, PS; + [(int_x86_wrssq GR64:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4; def WRUSSD_EVEX : I<0x65, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), "wrussd\t{$src, $dst|$dst, $src}", [(int_x86_wrussd GR32:$src, addr:$dst)]>, EVEX, NoCD8, T_MAP4, PD; @@ -574,51 +574,51 @@ let SchedRW = [WriteSystem] in { // on Windows without needing to enable the xsave feature to be compatible with // MSVC. let Defs = [EDX, EAX], Uses = [ECX] in -def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB, PS; +def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; let Uses = [EDX, EAX, ECX] in def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", - [(int_x86_xsetbv ECX, EDX, EAX)]>, TB, PS; + [(int_x86_xsetbv ECX, EDX, EAX)]>, TB; let Uses = [EDX, EAX] in { def XSAVE : I<0xAE, MRM4m, (outs), (ins opaquemem:$dst), "xsave\t$dst", - [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVE]>; + [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE]>; def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaquemem:$dst), "xsave64\t$dst", - [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVE, In64BitMode]>; + [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>; def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaquemem:$dst), "xrstor\t$dst", - [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVE]>; + [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE]>; def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaquemem:$dst), "xrstor64\t$dst", - [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVE, In64BitMode]>; + [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>; def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaquemem:$dst), "xsaveopt\t$dst", - [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVEOPT]>; + [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEOPT]>; def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaquemem:$dst), "xsaveopt64\t$dst", - [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVEOPT, In64BitMode]>; + [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEOPT, In64BitMode]>; def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaquemem:$dst), "xsavec\t$dst", - [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVEC]>; + [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC]>; 
def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaquemem:$dst), "xsavec64\t$dst", - [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVEC, In64BitMode]>; + [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC, In64BitMode]>; def XSAVES : I<0xC7, MRM5m, (outs), (ins opaquemem:$dst), "xsaves\t$dst", - [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVES]>; + [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>; def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaquemem:$dst), "xsaves64\t$dst", - [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVE, In64BitMode]>; + [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>; def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaquemem:$dst), "xrstors\t$dst", - [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVES]>; + [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>; def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaquemem:$dst), "xrstors64\t$dst", - [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, PS, Requires<[HasXSAVES, In64BitMode]>; + [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES, In64BitMode]>; } // Uses } // SchedRW @@ -651,10 +651,10 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in let SchedRW = [WriteSystem] in { let Defs = [EAX, EDX], Uses = [ECX] in def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", - [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB, PS; + [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB; let Uses = [EAX, ECX, EDX] in def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", - [(X86wrpkru EAX, EDX, ECX)]>, TB, PS; + [(X86wrpkru EAX, EDX, ECX)]>, TB; } // SchedRW //===----------------------------------------------------------------------===// @@ -718,15 +718,15 @@ let Predicates = [In64BitMode, HasINVPCID] in { //===----------------------------------------------------------------------===// // SMAP Instruction let Defs = [EFLAGS], SchedRW = [WriteSystem] in { - def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB, PS; - def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB, PS; + def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB; + def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB; } //===----------------------------------------------------------------------===// // SMX Instruction let SchedRW = [WriteSystem] in { let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in { - def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB, PS; + def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB; } // Uses, Defs } // SchedRW @@ -784,7 +784,7 @@ def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst), let SchedRW = [WriteSystem] in { let Uses = [ECX], Defs = [EAX, EDX] in - def RDPRU : I<0x01, MRM_FD, (outs), (ins), "rdpru", []>, TB, PS, + def RDPRU : I<0x01, MRM_FD, (outs), (ins), "rdpru", []>, TB, Requires<[HasRDPRU]>; } @@ -803,6 +803,6 @@ let Uses = [ECX], Defs = [EAX, EDX] in let SchedRW = [WriteSystem] in { let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX, RDX, EFLAGS] in - def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, TB, PS, + def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, TB, Requires<[HasPCONFIG]>; } // SchedRW diff --git a/llvm/lib/Target/X86/X86InstrTSX.td b/llvm/lib/Target/X86/X86InstrTSX.td index cc9174a0c491c..57604b682d54e 100644 --- a/llvm/lib/Target/X86/X86InstrTSX.td +++ b/llvm/lib/Target/X86/X86InstrTSX.td @@ -37,11 +37,11 @@ def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", 
[]>; } def XEND : I<0x01, MRM_D5, (outs), (ins), - "xend", [(int_x86_xend)]>, TB, PS, Requires<[HasRTM]>; + "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>; let Defs = [EFLAGS] in def XTEST : I<0x01, MRM_D6, (outs), (ins), - "xtest", [(set EFLAGS, (X86xtest))]>, TB, PS, Requires<[HasRTM]>; + "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasRTM]>; def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), "xabort\t$imm", diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 87eacf704de6c..919e941abfd11 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -31,9 +31,9 @@ class T_MAP4 { Map OpMap = T_MAP4; } class T_MAP5 { Map OpMap = T_MAP5; } class T_MAP6 { Map OpMap = T_MAP6; } class T_MAP7 { Map OpMap = T_MAP7; } -class XOP8 { Map OpMap = XOP8; Prefix OpPrefix = PS; } -class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; } -class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; } +class XOP8 { Map OpMap = XOP8; } +class XOP9 { Map OpMap = XOP9; } +class XOPA { Map OpMap = XOPA; } class ThreeDNow { Map OpMap = ThreeDNow; } class PS { Prefix OpPrefix = PS; } class PD { Prefix OpPrefix = PD; } @@ -79,7 +79,7 @@ class AVX512XDIi8Base : TB, XD { Domain ExeDomain = SSEPackedInt; ImmType ImmT = Imm8; } -class AVX512PSIi8Base : TB, PS { +class AVX512PSIi8Base : TB { Domain ExeDomain = SSEPackedSingle; ImmType ImmT = Imm8; } @@ -574,11 +574,11 @@ class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, : Ii8<o, F, outs, ins, asm, pattern>, TB, XS, Requires<[UseSSE1]>; class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, PS, + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, Requires<[UseSSE1]>; class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, PS, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, Requires<[UseSSE1]>; class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> @@ -587,7 +587,7 @@ class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm, class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> : I<o, F, outs, ins, !strconcat("v", asm), pattern, SSEPackedSingle>, - TB, PS, Requires<[HasAVX]>; + TB, Requires<[HasAVX]>; // SSE2 Instruction Templates: // @@ -694,11 +694,11 @@ class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[UseSSSE3]>; class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, PS, + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, Requires<[HasMMX, HasSSSE3]>; class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, PS, + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, Requires<[HasMMX, HasSSSE3]>; // SSE4.1 Instruction Templates: @@ -824,7 +824,7 @@ class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm, Requires<[HasAVX512]>; class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, PS, + : I<o, F, outs, ins, asm, pattern, SSEPackedSingle>, TB, Requires<[HasAVX512]>; class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, Domain d> @@ -947,14 +947,14 @@ class VRS2I<bits<8> o, Format F, dag outs, 
dag ins, string asm, // MMXIi8 - MMX instructions with ImmT == Imm8 and PS prefix. class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, PS, Requires<[HasMMX]>; + : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>; class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : I<o, F, outs, ins, asm, pattern>, TB, PS, REX_W, + : I<o, F, outs, ins, asm, pattern>, TB, REX_W, Requires<[HasMMX,In64BitMode]>; class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern> - : Ii8<o, F, outs, ins, asm, pattern>, TB, PS, Requires<[HasMMX]>; + : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>; /// ITy - This instruction base class takes the type info for the instruction. /// Using this, it: diff --git a/llvm/lib/Target/X86/X86InstrVMX.td b/llvm/lib/Target/X86/X86InstrVMX.td index f2fc0dbaa3703..7cc468fe15ad4 100644 --- a/llvm/lib/Target/X86/X86InstrVMX.td +++ b/llvm/lib/Target/X86/X86InstrVMX.td @@ -43,7 +43,7 @@ def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), "vmclear\t$vmcs", []>, TB, PD; // OF 01 D4 -def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB, PS; +def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB; // 0F 01 C2 def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; @@ -51,31 +51,31 @@ def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; // 0F 01 C3 def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB; def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), - "vmptrld\t$vmcs", []>, TB, PS; + "vmptrld\t$vmcs", []>, TB; def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs), - "vmptrst\t$vmcs", []>, TB, PS; + "vmptrst\t$vmcs", []>, TB; def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), - "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[In64BitMode]>; + "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), - "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[Not64BitMode]>; + "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[Not64BitMode]>; let mayStore = 1 in { def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), - "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[In64BitMode]>; + "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), - "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[Not64BitMode]>; + "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[Not64BitMode]>; } // mayStore def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[In64BitMode]>; + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[Not64BitMode]>; + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[Not64BitMode]>; let mayLoad = 1 in { def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, PS, Requires<[In64BitMode]>; + "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[In64BitMode]>; def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, PS, 
Requires<[Not64BitMode]>; + "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB, Requires<[Not64BitMode]>; } // mayLoad // 0F 01 C4 From 17ff25a58ee4f29816d932fdb75f0d305718069f Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Sun, 24 Dec 2023 11:09:16 +0800 Subject: [PATCH 219/342] [X86][NFC] Not infer OpSize from Xi8|16|32|64 For legacy (arithmetic) instructions, the operand size override prefix (0x66) is used to switch the operand data size from 32b to 16b (in 32/64-bit mode), 16b to 32b (in 16-bit mode). That's why we set OpSize16 for 16-bit instructions and set OpSize32 for 32-bit instructions. But it's not a generic rule any more after APX. APX adds 4 variants for arithmetic instructions: promoted EVEX, NDD (new data destination), NF (no flag), NF_NDD. All the 4 variants are in EVEX space and only legal in 64-bit mode. EVEX.pp is set to 01 for the 16-bit instructions to encode 0x66. For APX, we should set OpSizeFixed for 8/16/32/64-bit variants and set PD for the 16-bit variants. Hence, to reuse the classes ITy and its subclasses BinOp* for APX instructions, we extract the OpSize setting from the class ITy. --- llvm/lib/Target/X86/X86InstrArithmetic.td | 224 +++++++++++----------- llvm/lib/Target/X86/X86InstrFormats.td | 2 +- llvm/lib/Target/X86/X86InstrUtils.td | 25 +-- 3 files changed, 121 insertions(+), 130 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index abd0d87354f8e..22394545a7fa2 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -407,28 +407,28 @@ class IMulOpRMI<bits<8> o, string m, X86TypeInfo t, } let Constraints = "$src1 = $dst" in { -def INC16r_alt : INCDECR_ALT<0x40, "inc", Xi16>; -def INC32r_alt : INCDECR_ALT<0x40, "inc", Xi32>; +def INC16r_alt : INCDECR_ALT<0x40, "inc", Xi16>, OpSize16; +def INC32r_alt : INCDECR_ALT<0x40, "inc", Xi32>, OpSize32; def INC8r : INCDECR<MRM0r, "inc", Xi8, X86add_flag_nocf>; -def INC16r : INCDECR<MRM0r, "inc", Xi16, X86add_flag_nocf>; -def INC32r : INCDECR<MRM0r, "inc", Xi32, X86add_flag_nocf>; +def INC16r : INCDECR<MRM0r, "inc", Xi16, X86add_flag_nocf>, OpSize16; +def INC32r : INCDECR<MRM0r, "inc", Xi32, X86add_flag_nocf>, OpSize32; def INC64r : INCDECR<MRM0r, "inc", Xi64, X86add_flag_nocf>; -def DEC16r_alt : INCDECR_ALT<0x48, "dec", Xi16>; -def DEC32r_alt : INCDECR_ALT<0x48, "dec", Xi32>; +def DEC16r_alt : INCDECR_ALT<0x48, "dec", Xi16>, OpSize16; +def DEC32r_alt : INCDECR_ALT<0x48, "dec", Xi32>, OpSize32; def DEC8r : INCDECR<MRM1r, "dec", Xi8, X86sub_flag_nocf>; -def DEC16r : INCDECR<MRM1r, "dec", Xi16, X86sub_flag_nocf>; -def DEC32r : INCDECR<MRM1r, "dec", Xi32, X86sub_flag_nocf>; +def DEC16r : INCDECR<MRM1r, "dec", Xi16, X86sub_flag_nocf>, OpSize16; +def DEC32r : INCDECR<MRM1r, "dec", Xi32, X86sub_flag_nocf>, OpSize32; def DEC64r : INCDECR<MRM1r, "dec", Xi64, X86sub_flag_nocf>; } let Predicates = [UseIncDec] in { def INC8m : INCDECM<MRM0m, "inc", Xi8, 1>; -def INC16m : INCDECM<MRM0m, "inc", Xi16, 1>; -def INC32m : INCDECM<MRM0m, "inc", Xi32, 1>; +def INC16m : INCDECM<MRM0m, "inc", Xi16, 1>, OpSize16; +def INC32m : INCDECM<MRM0m, "inc", Xi32, 1>, OpSize32; def DEC8m : INCDECM<MRM1m, "dec", Xi8, -1>; -def DEC16m : INCDECM<MRM1m, "dec", Xi16, -1>; -def DEC32m : INCDECM<MRM1m, "dec", Xi32, -1>; +def DEC16m : INCDECM<MRM1m, "dec", Xi16, -1>, OpSize16; +def DEC32m : INCDECM<MRM1m, "dec", Xi32, -1>, OpSize32; } let Predicates = [UseIncDec, In64BitMode] in { def INC64m : INCDECM<MRM0m, "inc", Xi64, 
1>; @@ -448,11 +448,11 @@ def MUL8r : MulOpR<0xF6, MRM4r, "mul", Xi8, WriteIMul8, [(set AL, (mul AL, GR8:$src)), (implicit EFLAGS)]>; // AX,DX = AX*GR16 let Defs = [AX,DX,EFLAGS], Uses = [AX] in -def MUL16r : MulOpR<0xF7, MRM4r, "mul", Xi16, WriteIMul16, []>; +def MUL16r : MulOpR<0xF7, MRM4r, "mul", Xi16, WriteIMul16, []>, OpSize16; // EAX,EDX = EAX*GR32 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in def MUL32r : MulOpR<0xF7, MRM4r, "mul", Xi32, WriteIMul32, - [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>; + [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>, OpSize32; // RAX,RDX = RAX*GR64 let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def MUL64r : MulOpR<0xF7, MRM4r, "mul", Xi64, WriteIMul64, @@ -467,10 +467,10 @@ def MUL8m : MulOpM<0xF6, MRM4m, "mul", Xi8, WriteIMul8, (implicit EFLAGS)]>; // AX,DX = AX*[mem16] let Defs = [AX,DX,EFLAGS], Uses = [AX] in -def MUL16m : MulOpM<0xF7, MRM4m, "mul", Xi16, WriteIMul16, []>; +def MUL16m : MulOpM<0xF7, MRM4m, "mul", Xi16, WriteIMul16, []>, OpSize16; // EAX,EDX = EAX*[mem32] let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in -def MUL32m : MulOpM<0xF7, MRM4m, "mul", Xi32, WriteIMul32, []>; +def MUL32m : MulOpM<0xF7, MRM4m, "mul", Xi32, WriteIMul32, []>, OpSize32; // RAX,RDX = RAX*[mem64] let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def MUL64m : MulOpM<0xF7, MRM4m, "mul", Xi64, WriteIMul64, []>, @@ -481,10 +481,10 @@ let Defs = [AL,EFLAGS,AX], Uses = [AL] in def IMUL8r : MulOpR<0xF6, MRM5r, "imul", Xi8, WriteIMul8, []>; // AX,DX = AX*GR16 let Defs = [AX,DX,EFLAGS], Uses = [AX] in -def IMUL16r : MulOpR<0xF7, MRM5r, "imul", Xi16, WriteIMul16, []>; +def IMUL16r : MulOpR<0xF7, MRM5r, "imul", Xi16, WriteIMul16, []>, OpSize16; // EAX,EDX = EAX*GR32 let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in -def IMUL32r : MulOpR<0xF7, MRM5r, "imul", Xi32, WriteIMul32, []>; +def IMUL32r : MulOpR<0xF7, MRM5r, "imul", Xi32, WriteIMul32, []>, OpSize32; // RAX,RDX = RAX*GR64 let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def IMUL64r : MulOpR<0xF7, MRM5r, "imul", Xi64, WriteIMul64, []>; @@ -494,10 +494,10 @@ let Defs = [AL,EFLAGS,AX], Uses = [AL] in def IMUL8m : MulOpM<0xF6, MRM5m, "imul", Xi8, WriteIMul8, []>; // AX,DX = AX*[mem16] let Defs = [AX,DX,EFLAGS], Uses = [AX] in -def IMUL16m : MulOpM<0xF7, MRM5m, "imul", Xi16, WriteIMul16, []>; +def IMUL16m : MulOpM<0xF7, MRM5m, "imul", Xi16, WriteIMul16, []>, OpSize16; // EAX,EDX = EAX*[mem32] let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in -def IMUL32m : MulOpM<0xF7, MRM5m, "imul", Xi32, WriteIMul32, []>; +def IMUL32m : MulOpM<0xF7, MRM5m, "imul", Xi32, WriteIMul32, []>, OpSize32; // RAX,RDX = RAX*[mem64] let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def IMUL64m : MulOpM<0xF7, MRM5m, "imul", Xi64, WriteIMul64, []>, @@ -505,13 +505,13 @@ def IMUL64m : MulOpM<0xF7, MRM5m, "imul", Xi64, WriteIMul64, []>, let Constraints = "$src1 = $dst" in { // Register-Register Signed Integer Multiply -def IMUL16rr : IMulOpRR<0xAF, "imul", Xi16, WriteIMul16Reg>; -def IMUL32rr : IMulOpRR<0xAF, "imul", Xi32, WriteIMul32Reg>; +def IMUL16rr : IMulOpRR<0xAF, "imul", Xi16, WriteIMul16Reg>, OpSize16; +def IMUL32rr : IMulOpRR<0xAF, "imul", Xi32, WriteIMul32Reg>, OpSize32; def IMUL64rr : IMulOpRR<0xAF, "imul", Xi64, WriteIMul64Reg>; // Register-Memory Signed Integer Multiply -def IMUL16rm : IMulOpRM<0xAF, "imul", Xi16, WriteIMul16Reg>; -def IMUL32rm : IMulOpRM<0xAF, "imul", Xi32, WriteIMul32Reg>; +def IMUL16rm : IMulOpRM<0xAF, "imul", Xi16, WriteIMul16Reg>, OpSize16; +def IMUL32rm : IMulOpRM<0xAF, "imul", Xi32, WriteIMul32Reg>, OpSize32; def 
IMUL64rm : IMulOpRM<0xAF, "imul", Xi64, WriteIMul64Reg>; } @@ -521,13 +521,13 @@ def IMUL64rm : IMulOpRM<0xAF, "imul", Xi64, WriteIMul64Reg>; // Register-Integer Signed Integer Multiply // GR16 = GR16*I8 -def IMUL16rri8 : IMulOpRRI8<0x6B, "imul", Xi16, WriteIMul16Imm>; +def IMUL16rri8 : IMulOpRRI8<0x6B, "imul", Xi16, WriteIMul16Imm>, OpSize16; // GR16 = GR16*I16 -def IMUL16rri : IMulOpRRI<0x69, "imul", Xi16, WriteIMul16Imm>; +def IMUL16rri : IMulOpRRI<0x69, "imul", Xi16, WriteIMul16Imm>, OpSize16; // GR32 = GR32*I8 -def IMUL32rri8 : IMulOpRRI8<0x6B, "imul", Xi32, WriteIMul32Imm>; +def IMUL32rri8 : IMulOpRRI8<0x6B, "imul", Xi32, WriteIMul32Imm>, OpSize32; // GR32 = GR32*I32 -def IMUL32rri : IMulOpRRI<0x69, "imul", Xi32, WriteIMul32Imm>; +def IMUL32rri : IMulOpRRI<0x69, "imul", Xi32, WriteIMul32Imm>, OpSize32; // GR64 = GR64*I8 def IMUL64rri8 : IMulOpRRI8<0x6B, "imul", Xi64, WriteIMul64Imm>; // GR64 = GR64*I32 @@ -535,13 +535,13 @@ def IMUL64rri32 : IMulOpRRI<0x69, "imul", Xi64, WriteIMul64Imm>; // Memory-Integer Signed Integer Multiply // GR16 = [mem16]*I8 -def IMUL16rmi8 : IMulOpRMI8<0x6B, "imul", Xi16, WriteIMul16Imm>; +def IMUL16rmi8 : IMulOpRMI8<0x6B, "imul", Xi16, WriteIMul16Imm>, OpSize16; // GR16 = [mem16]*I16 -def IMUL16rmi : IMulOpRMI<0x69, "imul", Xi16, WriteIMul16Imm>; +def IMUL16rmi : IMulOpRMI<0x69, "imul", Xi16, WriteIMul16Imm>, OpSize16; // GR32 = [mem32]*I8 -def IMUL32rmi8 : IMulOpRMI8<0x6B, "imul", Xi32, WriteIMul32Imm>; +def IMUL32rmi8 : IMulOpRMI8<0x6B, "imul", Xi32, WriteIMul32Imm>, OpSize32; // GR32 = [mem32]*I32 -def IMUL32rmi : IMulOpRMI<0x69, "imul", Xi32, WriteIMul32Imm>; +def IMUL32rmi : IMulOpRMI<0x69, "imul", Xi32, WriteIMul32Imm>, OpSize32; // GR64 = [mem64]*I8 def IMUL64rmi8 : IMulOpRMI8<0x6B, "imul", Xi64, WriteIMul64Imm>; // GR64 = [mem64]*I32 @@ -554,10 +554,10 @@ let Defs = [AL,AH,EFLAGS], Uses = [AX] in def DIV8r : MulOpR<0xF6, MRM6r, "div", Xi8, WriteDiv8, []>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in // DX:AX/r16 = AX,DX -def DIV16r : MulOpR<0xF7, MRM6r, "div", Xi16, WriteDiv16, []>; +def DIV16r : MulOpR<0xF7, MRM6r, "div", Xi16, WriteDiv16, []>, OpSize16; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/r32 = EAX,EDX -def DIV32r : MulOpR<0xF7, MRM6r, "div", Xi32, WriteDiv32, []>; +def DIV32r : MulOpR<0xF7, MRM6r, "div", Xi32, WriteDiv32, []>, OpSize32; // RDX:RAX/r64 = RAX,RDX let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in def DIV64r : MulOpR<0xF7, MRM6r, "div", Xi64, WriteDiv64, []>; @@ -568,9 +568,9 @@ let Defs = [AL,AH,EFLAGS], Uses = [AX] in def DIV8m : MulOpM<0xF6, MRM6m, "div", Xi8, WriteDiv8, []>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in // DX:AX/[mem16] = AX,DX -def DIV16m : MulOpM<0xF7, MRM6m, "div", Xi16, WriteDiv16, []>; +def DIV16m : MulOpM<0xF7, MRM6m, "div", Xi16, WriteDiv16, []>, OpSize16; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX -def DIV32m : MulOpM<0xF7, MRM6m, "div", Xi32, WriteDiv32, []>; +def DIV32m : MulOpM<0xF7, MRM6m, "div", Xi32, WriteDiv32, []>, OpSize32; // RDX:RAX/[mem64] = RAX,RDX let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in def DIV64m : MulOpM<0xF7, MRM6m, "div", Xi64, WriteDiv64, []>, @@ -583,10 +583,10 @@ let Defs = [AL,AH,EFLAGS], Uses = [AX] in def IDIV8r : MulOpR<0xF6, MRM7r, "idiv", Xi8, WriteIDiv8, []>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in // DX:AX/r16 = AX,DX -def IDIV16r: MulOpR<0xF7, MRM7r, "idiv", Xi16, WriteIDiv16, []>; +def IDIV16r: MulOpR<0xF7, MRM7r, "idiv", Xi16, WriteIDiv16, []>, OpSize16; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in 
// EDX:EAX/r32 = EAX,EDX -def IDIV32r: MulOpR<0xF7, MRM7r, "idiv", Xi32, WriteIDiv32, []>; +def IDIV32r: MulOpR<0xF7, MRM7r, "idiv", Xi32, WriteIDiv32, []>, OpSize32; // RDX:RAX/r64 = RAX,RDX let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in def IDIV64r: MulOpR<0xF7, MRM7r, "idiv", Xi64, WriteIDiv64, []>; @@ -596,10 +596,10 @@ let Defs = [AL,AH,EFLAGS], Uses = [AX] in def IDIV8m : MulOpM<0xF6, MRM7m, "idiv", Xi8, WriteIDiv8, []>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in // DX:AX/[mem16] = AX,DX -def IDIV16m: MulOpM<0xF7, MRM7m, "idiv", Xi16, WriteIDiv16, []>; +def IDIV16m: MulOpM<0xF7, MRM7m, "idiv", Xi16, WriteIDiv16, []>, OpSize16; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX -def IDIV32m: MulOpM<0xF7, MRM7m, "idiv", Xi32, WriteIDiv32, []>; +def IDIV32m: MulOpM<0xF7, MRM7m, "idiv", Xi32, WriteIDiv32, []>, OpSize32; let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX // RDX:RAX/[mem64] = RAX,RDX def IDIV64m: MulOpM<0xF7, MRM7m, "idiv", Xi64, WriteIDiv64, []>, @@ -608,26 +608,26 @@ def IDIV64m: MulOpM<0xF7, MRM7m, "idiv", Xi64, WriteIDiv64, []>, let Constraints = "$src1 = $dst" in { def NEG8r : NegOpR<0xF6, "neg", Xi8>; -def NEG16r : NegOpR<0xF7, "neg", Xi16>; -def NEG32r : NegOpR<0xF7, "neg", Xi32>; +def NEG16r : NegOpR<0xF7, "neg", Xi16>, OpSize16; +def NEG32r : NegOpR<0xF7, "neg", Xi32>, OpSize32; def NEG64r : NegOpR<0xF7, "neg", Xi64>; } def NEG8m : NegOpM<0xF6, "neg", Xi8>; -def NEG16m : NegOpM<0xF7, "neg", Xi16>; -def NEG32m : NegOpM<0xF7, "neg", Xi32>; +def NEG16m : NegOpM<0xF7, "neg", Xi16>, OpSize16; +def NEG32m : NegOpM<0xF7, "neg", Xi32>, OpSize32; def NEG64m : NegOpM<0xF7, "neg", Xi64>, Requires<[In64BitMode]>; let Constraints = "$src1 = $dst" in { def NOT8r : NotOpR<0xF6, "not", Xi8>; -def NOT16r : NotOpR<0xF7, "not", Xi16>; -def NOT32r : NotOpR<0xF7, "not", Xi32>; +def NOT16r : NotOpR<0xF7, "not", Xi16>, OpSize16; +def NOT32r : NotOpR<0xF7, "not", Xi32>, OpSize32; def NOT64r : NotOpR<0xF7, "not", Xi64>; } def NOT8m : NotOpM<0xF6, "not", Xi8>; -def NOT16m : NotOpM<0xF7, "not", Xi16>; -def NOT32m : NotOpM<0xF7, "not", Xi32>; +def NOT16m : NotOpM<0xF7, "not", Xi16>, OpSize16; +def NOT32m : NotOpM<0xF7, "not", Xi32>, OpSize32; def NOT64m : NotOpM<0xF7, "not", Xi64>, Requires<[In64BitMode]>; /// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is @@ -644,50 +644,50 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let isCommutable = CommutableRR, isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in { def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>; - def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>; - def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>; + def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>, OpSize16; + def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>, OpSize32; def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>; } def NAME#8rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>; - def NAME#32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#16rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; + def NAME#32rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; def NAME#64rr_REV : BinOpRR_RF_Rev<BaseOpc2, mnemonic, Xi64>; def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>; - def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>; - def NAME#32rm : 
BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>; + def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>, OpSize16; + def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>, OpSize32; def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>; // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. - def NAME#16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>; - def NAME#32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>; + def NAME#16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; + def NAME#32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; def NAME#64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>; - def NAME#16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>; - def NAME#32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>; + def NAME#16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16; + def NAME#32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32; def NAME#64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>; } } // Constraints = "$src1 = $dst" def NAME#8mr : BinOpMR_MF<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#16mr : BinOpMR_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def NAME#32mr : BinOpMR_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; def NAME#64mr : BinOpMR_MF<BaseOpc, mnemonic, Xi64, opnode>; // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. 
- def NAME#16mi8 : BinOpMI8_MF<mnemonic, Xi16, MemMRM>; - def NAME#32mi8 : BinOpMI8_MF<mnemonic, Xi32, MemMRM>; + def NAME#16mi8 : BinOpMI8_MF<mnemonic, Xi16, MemMRM>, OpSize16; + def NAME#32mi8 : BinOpMI8_MF<mnemonic, Xi32, MemMRM>, OpSize32; let Predicates = [In64BitMode] in def NAME#64mi8 : BinOpMI8_MF<mnemonic, Xi64, MemMRM>; def NAME#8mi : BinOpMI_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>; + def NAME#16mi : BinOpMI_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; + def NAME#32mi : BinOpMI_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; let Predicates = [In64BitMode] in def NAME#64mi32 : BinOpMI_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; @@ -702,9 +702,9 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#8i8 : BinOpAI_AF<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">; def NAME#16i16 : BinOpAI_AF<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; + "{$src, %ax|ax, $src}">, OpSize16; def NAME#32i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; + "{$src, %eax|eax, $src}">, OpSize32; def NAME#64i32 : BinOpAI_AF<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">; } @@ -723,51 +723,51 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let isCommutable = CommutableRR in { def NAME#8rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi8 , opnode>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#16rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def NAME#32rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; def NAME#64rr : BinOpRRF_RF<BaseOpc, mnemonic, Xi64, opnode>; } // isConvertibleToThreeAddress } // isCommutable def NAME#8rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>; - def NAME#32rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#16rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; + def NAME#32rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi32>, OpSize32; def NAME#64rr_REV : BinOpRRF_RF_Rev<BaseOpc2, mnemonic, Xi64>; def NAME#8rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi8 , opnode>; - def NAME#16rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>; - def NAME#32rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#16rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16; + def NAME#32rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32; def NAME#64rm : BinOpRMF_RF<BaseOpc2, mnemonic, Xi64, opnode>; def NAME#8ri : BinOpRIF_RF<0x80, mnemonic, Xi8 , opnode, RegMRM>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. 
- def NAME#16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>; - def NAME#32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>; + def NAME#16ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16; + def NAME#32ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32; def NAME#64ri8 : BinOpRI8F_RF<0x83, mnemonic, Xi64, RegMRM>; - def NAME#16ri : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>; - def NAME#32ri : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>; + def NAME#16ri : BinOpRIF_RF<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; + def NAME#32ri : BinOpRIF_RF<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; def NAME#64ri32: BinOpRIF_RF<0x81, mnemonic, Xi64, opnode, RegMRM>; } } // Constraints = "$src1 = $dst" def NAME#8mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#16mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def NAME#32mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; def NAME#64mr : BinOpMRF_MF<BaseOpc, mnemonic, Xi64, opnode>; // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. - def NAME#16mi8 : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>; - def NAME#32mi8 : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>; + def NAME#16mi8 : BinOpMI8F_MF<mnemonic, Xi16, MemMRM>, OpSize16; + def NAME#32mi8 : BinOpMI8F_MF<mnemonic, Xi32, MemMRM>, OpSize32; let Predicates = [In64BitMode] in def NAME#64mi8 : BinOpMI8F_MF<mnemonic, Xi64, MemMRM>; def NAME#8mi : BinOpMIF_MF<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>; + def NAME#16mi : BinOpMIF_MF<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; + def NAME#32mi : BinOpMIF_MF<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; let Predicates = [In64BitMode] in def NAME#64mi32 : BinOpMIF_MF<0x81, mnemonic, Xi64, opnode, MemMRM>; @@ -782,9 +782,9 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#8i8 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">; def NAME#16i16 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; + "{$src, %ax|ax, $src}">, OpSize16; def NAME#32i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; + "{$src, %eax|eax, $src}">, OpSize32; def NAME#64i32 : BinOpAIF_AF<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">; } @@ -800,20 +800,20 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let isCommutable = CommutableRR in { def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>; let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { - def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>; } // isConvertibleToThreeAddress } // isCommutable def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>; - def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>; - def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>; + def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>, OpSize16; + def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>, 
OpSize32; def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>; def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>; - def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>; - def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>; + def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>, OpSize16; + def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>, OpSize32; def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>; def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>; @@ -821,30 +821,30 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, let isConvertibleToThreeAddress = ConvertibleToThreeAddress in { // NOTE: These are order specific, we want the ri8 forms to be listed // first so that they are slightly preferred to the ri forms. - def NAME#16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>; - def NAME#32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>; + def NAME#16ri8 : BinOpRI8_F<0x83, mnemonic, Xi16, RegMRM>, OpSize16; + def NAME#32ri8 : BinOpRI8_F<0x83, mnemonic, Xi32, RegMRM>, OpSize32; def NAME#64ri8 : BinOpRI8_F<0x83, mnemonic, Xi64, RegMRM>; - def NAME#16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>; - def NAME#32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>; + def NAME#16ri : BinOpRI_F<0x81, mnemonic, Xi16, opnode, RegMRM>, OpSize16; + def NAME#32ri : BinOpRI_F<0x81, mnemonic, Xi32, opnode, RegMRM>, OpSize32; def NAME#64ri32: BinOpRI_F<0x81, mnemonic, Xi64, opnode, RegMRM>; } def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>; - def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>; - def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>; + def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>, OpSize16; + def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>, OpSize32; def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>; // NOTE: These are order specific, we want the mi8 forms to be listed // first so that they are slightly preferred to the mi forms. - def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, MemMRM>; - def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, MemMRM>; + def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, MemMRM>, OpSize16; + def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, MemMRM>, OpSize32; let Predicates = [In64BitMode] in def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, MemMRM>; def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; - def NAME#16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>; - def NAME#32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>; + def NAME#16mi : BinOpMI_F<0x81, mnemonic, Xi16, opnode, MemMRM>, OpSize16; + def NAME#32mi : BinOpMI_F<0x81, mnemonic, Xi32, opnode, MemMRM>, OpSize32; let Predicates = [In64BitMode] in def NAME#64mi32 : BinOpMI_F<0x81, mnemonic, Xi64, opnode, MemMRM>; @@ -859,9 +859,9 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, "{$src, %al|al, $src}">; def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; + "{$src, %ax|ax, $src}">, OpSize16; def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; + "{$src, %eax|eax, $src}">, OpSize32; def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, "{$src, %rax|rax, $src}">; } @@ -1039,31 +1039,31 @@ let isCompare = 1 in { // combine them. This gives bunch of other patterns that start with // and a chance to match. 
def TEST8rr : BinOpRR_F<0x84, "test", Xi8 , null_frag>; - def TEST16rr : BinOpRR_F<0x85, "test", Xi16, null_frag>; - def TEST32rr : BinOpRR_F<0x85, "test", Xi32, null_frag>; + def TEST16rr : BinOpRR_F<0x85, "test", Xi16, null_frag>, OpSize16; + def TEST32rr : BinOpRR_F<0x85, "test", Xi32, null_frag>, OpSize32; def TEST64rr : BinOpRR_F<0x85, "test", Xi64, null_frag>; } // isCommutable def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , null_frag>; -def TEST16mr : BinOpMR_F<0x85, "test", Xi16, null_frag>; -def TEST32mr : BinOpMR_F<0x85, "test", Xi32, null_frag>; +def TEST16mr : BinOpMR_F<0x85, "test", Xi16, null_frag>, OpSize16; +def TEST32mr : BinOpMR_F<0x85, "test", Xi32, null_frag>, OpSize32; def TEST64mr : BinOpMR_F<0x85, "test", Xi64, null_frag>; def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; -def TEST16ri : BinOpRI_F<0xF7, "test", Xi16, X86testpat, MRM0r>; -def TEST32ri : BinOpRI_F<0xF7, "test", Xi32, X86testpat, MRM0r>; +def TEST16ri : BinOpRI_F<0xF7, "test", Xi16, X86testpat, MRM0r>, OpSize16; +def TEST32ri : BinOpRI_F<0xF7, "test", Xi32, X86testpat, MRM0r>, OpSize32; def TEST64ri32 : BinOpRI_F<0xF7, "test", Xi64, X86testpat, MRM0r>; def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; -def TEST16mi : BinOpMI_F<0xF7, "test", Xi16, X86testpat, MRM0m>; -def TEST32mi : BinOpMI_F<0xF7, "test", Xi32, X86testpat, MRM0m>; +def TEST16mi : BinOpMI_F<0xF7, "test", Xi16, X86testpat, MRM0m>, OpSize16; +def TEST32mi : BinOpMI_F<0xF7, "test", Xi32, X86testpat, MRM0m>, OpSize32; let Predicates = [In64BitMode] in def TEST64mi32 : BinOpMI_F<0xF7, "test", Xi64, X86testpat, MRM0m>; def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, "{$src, %al|al, $src}">; -def TEST16i16 : BinOpAI_F<0xA9, "test", Xi16, AX, "{$src, %ax|ax, $src}">; -def TEST32i32 : BinOpAI_F<0xA9, "test", Xi32, EAX, "{$src, %eax|eax, $src}">; +def TEST16i16 : BinOpAI_F<0xA9, "test", Xi16, AX, "{$src, %ax|ax, $src}">, OpSize16; +def TEST32i32 : BinOpAI_F<0xA9, "test", Xi32, EAX, "{$src, %eax|eax, $src}">, OpSize32; def TEST64i32 : BinOpAI_F<0xA9, "test", Xi64, RAX, "{$src, %rax|rax, $src}">; } // isCompare diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td index f94072a0c7076..07e5576960d65 100644 --- a/llvm/lib/Target/X86/X86InstrFormats.td +++ b/llvm/lib/Target/X86/X86InstrFormats.td @@ -180,7 +180,7 @@ class OperandSize<bits<2> val> { bits<2> Value = val; } def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix. -def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode. +def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32/64-bit mode. def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode. // Address size for encodings that change based on mode. diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 919e941abfd11..ac0507fce94fb 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -122,8 +122,7 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, Operand immoperand, SDPatternOperator immoperator, SDPatternOperator immnosuoperator, Operand imm8operand, SDPatternOperator imm8operator, SDPatternOperator imm8nosuoperator, - bit hasEvenOpcode, OperandSize opSize, - bit hasREX_W> { + bit hasEvenOpcode, bit hasREX_W> { /// VT - This is the value type itself. ValueType VT = vt; @@ -177,11 +176,6 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, /// other datatypes are usually odd. 
bit HasEvenOpcode = hasEvenOpcode; - /// OpSize - Selects whether the instruction needs a 0x66 prefix based on - /// 16-bit vs 32-bit mode. i8/i64 set this to OpSizeFixed. i16 sets this - /// to Opsize16. i32 sets this to OpSize32. - OperandSize OpSize = opSize; - /// HasREX_W - This bit is set to true if the instruction should have /// the 0x40 REX prefix. This is set for i64 types. bit HasREX_W = hasREX_W; @@ -191,16 +185,16 @@ def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, Imm8, i8imm, imm_su, imm, i8imm, invalid_node, invalid_node, - 1, OpSizeFixed, 0>; + 1, 0>; def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, Imm16, i16imm, imm_su, imm, i16i8imm, i16immSExt8_su, i16immSExt8, - 0, OpSize16, 0>; + 0, 0>; def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, Imm32, i32imm, imm_su, imm, i32i8imm, i32immSExt8_su, i32immSExt8, - 0, OpSize32, 0>; + 0, 0>; def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, Imm32S, i64i32imm, - i64immSExt32_su, i64immSExt32, i64i8imm, i64immSExt8_su, - i64immSExt8, 0, OpSizeFixed, 1>; + i64immSExt32_su, i64immSExt32, i64i8imm, i64immSExt8_su, + i64immSExt8, 0, 1>; // Group template arguments that can be derived from the vector type (EltNum x // EltVT). These are things like the register class for the writemask, etc. @@ -960,9 +954,8 @@ class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, /// Using this, it: /// 1. Concatenates together the instruction mnemonic with the appropriate /// suffix letter, a tab, and the arguments. -/// 2. Infers whether the instruction should have a 0x66 prefix byte. -/// 3. Infers whether the instruction should have a 0x40 REX_W prefix. -/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations) +/// 2. Infers whether the instruction should have a 0x40 REX_W prefix. +/// 3. Infers whether the low bit of the opcode should be 0 (for i8 operations) /// or 1 (for i16,i32,i64 operations). class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, string mnemonic, string args, list<dag> pattern> @@ -972,7 +965,5 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> { let hasSideEffects = 0; - // Infer instruction prefixes from type info. - let OpSize = typeinfo.OpSize; let hasREX_W = typeinfo.HasREX_W; } From 1daf2994de49d1ecba4bee4e6842aa8a564cbc96 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Sat, 23 Dec 2023 22:21:52 -0800 Subject: [PATCH 220/342] [llvm] Use StringRef::contains (NFC) --- llvm/lib/Support/Signals.cpp | 2 +- .../lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp | 2 +- llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp | 4 ++-- llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 2 +- llvm/lib/Transforms/Instrumentation/MemProfiler.cpp | 2 +- llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp | 2 +- llvm/utils/TableGen/AsmMatcherEmitter.cpp | 6 +++--- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Support/Signals.cpp b/llvm/lib/Support/Signals.cpp index 669a9e2a83965..9f9030e79d104 100644 --- a/llvm/lib/Support/Signals.cpp +++ b/llvm/lib/Support/Signals.cpp @@ -145,7 +145,7 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace, return false; // Don't recursively invoke the llvm-symbolizer binary. 
- if (Argv0.find("llvm-symbolizer") != std::string::npos) + if (Argv0.contains("llvm-symbolizer")) return false; // FIXME: Subtract necessary number from StackTrace entries to turn return addresses diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index fffd5abd9f8bd..0740ac58a3381 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -554,7 +554,7 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT, // Add qfloat subtarget feature by default to v68 and above // unless explicitely disabled if (checkFeature(X, Hexagon::ExtensionHVXV68) && - ArchFS.find("-hvx-qfloat", 0) == std::string::npos) { + !ArchFS.contains("-hvx-qfloat")) { llvm::FeatureBitset Features = X->getFeatureBits(); X->setFeatureBits(Features.set(Hexagon::ExtensionHVXQFloat)); } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 5ac45079bd002..c85bd27d256b2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1617,7 +1617,7 @@ static bool buildEnqueueKernel(const SPIRV::IncomingCall *Call, SPIRVGlobalRegistry *GR) { MachineRegisterInfo *MRI = MIRBuilder.getMRI(); const DataLayout &DL = MIRBuilder.getDataLayout(); - bool HasEvents = Call->Builtin->Name.find("events") != StringRef::npos; + bool HasEvents = Call->Builtin->Name.contains("events"); const SPIRVType *Int32Ty = GR->getOrCreateSPIRVIntegerType(32, MIRBuilder); // Make vararg instructions before OpEnqueueKernel. @@ -2098,7 +2098,7 @@ parseBuiltinTypeNameToTargetExtType(std::string TypeName, // Parameterized SPIR-V builtins names follow this format: // e.g. %spirv.Image._void_1_0_0_0_0_0_0, %spirv.Pipe._0 - if (NameWithParameters.find('_') == std::string::npos) + if (!NameWithParameters.contains('_')) return TargetExtType::get(MIRBuilder.getContext(), NameWithParameters); SmallVector<StringRef> Parameters; diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 6468d07b4f4f4..afb0e6cd1548b 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -2737,7 +2737,7 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { // the shadow memory. // We cannot just ignore these methods, because they may call other // instrumented functions. - if (F.getName().find(" load]") != std::string::npos) { + if (F.getName().contains(" load]")) { FunctionCallee AsanInitFunction = declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {}); IRBuilder<> IRB(&F.front(), F.front().begin()); diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index 539b7441d24b3..2236e9cd44c50 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -535,7 +535,7 @@ bool MemProfiler::maybeInsertMemProfInitAtFunctionEntry(Function &F) { // the shadow memory. // We cannot just ignore these methods, because they may call other // instrumented functions. 
- if (F.getName().find(" load]") != std::string::npos) { + if (F.getName().contains(" load]")) { FunctionCallee MemProfInitFunction = declareSanitizerInitFunction(*F.getParent(), MemProfInitName, {}); IRBuilder<> IRB(&F.front(), F.front().begin()); diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index fe672a4377a1f..ce570bdfd8b8d 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -603,7 +603,7 @@ void ModuleSanitizerCoverage::instrumentFunction( Function &F, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) { if (F.empty()) return; - if (F.getName().find(".module_ctor") != std::string::npos) + if (F.getName().contains(".module_ctor")) return; // Should not instrument sanitizer init functions. if (F.getName().starts_with("__sanitizer_")) return; // Don't instrument __sanitizer_* callbacks. diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index 7deeff8887dbb..73724e662f9e8 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -985,7 +985,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, bool IsIsolatedToken = true; for (size_t i = 0, e = String.size(); i != e; ++i) { char Char = String[i]; - if (Variant.BreakCharacters.find(Char) != std::string::npos) { + if (Variant.BreakCharacters.contains(Char)) { if (InTok) { addAsmOperand(String.slice(Prev, i), false); Prev = i; @@ -994,7 +994,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, InTok = true; continue; } - if (Variant.TokenizingCharacters.find(Char) != std::string::npos) { + if (Variant.TokenizingCharacters.contains(Char)) { if (InTok) { addAsmOperand(String.slice(Prev, i), IsIsolatedToken); InTok = false; @@ -1005,7 +1005,7 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info, IsIsolatedToken = true; continue; } - if (Variant.SeparatorCharacters.find(Char) != std::string::npos) { + if (Variant.SeparatorCharacters.contains(Char)) { if (InTok) { addAsmOperand(String.slice(Prev, i), IsIsolatedToken); InTok = false; From 0f1721c480369bad1c8d3f9a664f8db6853f35fc Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Sat, 23 Dec 2023 22:30:03 -0800 Subject: [PATCH 221/342] [clang-tidy] Use StringRef::contains (NFC) --- .../clang-tidy/performance/InefficientAlgorithmCheck.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp index 3c8751dbdd733..ad900fcec2dee 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/InefficientAlgorithmCheck.cpp @@ -97,7 +97,7 @@ void InefficientAlgorithmCheck::check(const MatchFinder::MatchResult &Result) { if (!AlgDecl) return; - if (Unordered && AlgDecl->getName().find("bound") != llvm::StringRef::npos) + if (Unordered && AlgDecl->getName().contains("bound")) return; const auto *AlgParam = Result.Nodes.getNodeAs<Expr>("AlgParam"); From eeeb963841e05e3d53d730a1a46fd9fa9996d409 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser <nikolasklauser@berlin.de> Date: Sun, 24 Dec 2023 08:45:25 +0100 Subject: [PATCH 222/342] [libc++] Use __datasizeof for __libcpp_datasizeof if available (#72104) This avoids the UB and makes things a bit cheaper in 
terms of compile-times. --- libcxx/include/__type_traits/datasizeof.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/libcxx/include/__type_traits/datasizeof.h b/libcxx/include/__type_traits/datasizeof.h index 5688e3293a69e..3a8b151601073 100644 --- a/libcxx/include/__type_traits/datasizeof.h +++ b/libcxx/include/__type_traits/datasizeof.h @@ -28,13 +28,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _Tp> struct __libcpp_datasizeof { -#if __has_cpp_attribute(__no_unique_address__) +#if __has_extension(datasizeof) + static const size_t value = __datasizeof(_Tp); +#else +// NOLINTNEXTLINE(readability-redundant-preprocessor) This is https://llvm.org/PR64825 +# if __has_cpp_attribute(__no_unique_address__) template <class = char> struct _FirstPaddingByte { [[__no_unique_address__]] _Tp __v_; char __first_padding_byte_; }; -#else +# else template <bool = __libcpp_is_final<_Tp>::value || !is_class<_Tp>::value> struct _FirstPaddingByte : _Tp { char __first_padding_byte_; @@ -45,7 +49,7 @@ struct __libcpp_datasizeof { _Tp __v_; char __first_padding_byte_; }; -#endif +# endif // __has_cpp_attribute(__no_unique_address__) // _FirstPaddingByte<> is sometimes non-standard layout. Using `offsetof` is UB in that case, but GCC and Clang allow // the use as an extension. @@ -53,6 +57,7 @@ struct __libcpp_datasizeof { _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-offsetof") static const size_t value = offsetof(_FirstPaddingByte<>, __first_padding_byte_); _LIBCPP_DIAGNOSTIC_POP +#endif // __has_extension(datasizeof) }; _LIBCPP_END_NAMESPACE_STD From 50ae0da058297944ef26bf9c01f4133be65676ab Mon Sep 17 00:00:00 2001 From: Nikolas Klauser <nikolasklauser@berlin.de> Date: Sun, 24 Dec 2023 08:50:01 +0100 Subject: [PATCH 223/342] [libc++][NFC] Refactor <experimental/simd> a bit to simplify dependencies (#76283) --- libcxx/include/CMakeLists.txt | 2 - libcxx/include/experimental/__simd/abi_tag.h | 55 ------------------- .../include/experimental/__simd/aligned_tag.h | 13 ++++- .../include/experimental/__simd/declaration.h | 52 +++++++++++++++++- .../__simd/internal_declaration.h | 41 -------------- libcxx/include/experimental/__simd/scalar.h | 2 +- libcxx/include/experimental/__simd/simd.h | 2 - .../include/experimental/__simd/simd_mask.h | 2 - libcxx/include/experimental/__simd/traits.h | 15 +---- libcxx/include/experimental/__simd/vec_ext.h | 2 +- libcxx/include/experimental/simd | 1 - libcxx/include/module.modulemap.in | 2 - 12 files changed, 66 insertions(+), 123 deletions(-) delete mode 100644 libcxx/include/experimental/__simd/abi_tag.h delete mode 100644 libcxx/include/experimental/__simd/internal_declaration.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 746d5812fba04..0fe3ab44d2466 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -914,10 +914,8 @@ set(files expected experimental/__config experimental/__memory - experimental/__simd/abi_tag.h experimental/__simd/aligned_tag.h experimental/__simd/declaration.h - experimental/__simd/internal_declaration.h experimental/__simd/reference.h experimental/__simd/scalar.h experimental/__simd/simd.h diff --git a/libcxx/include/experimental/__simd/abi_tag.h b/libcxx/include/experimental/__simd/abi_tag.h deleted file mode 100644 index cec5be65ce5c2..0000000000000 --- a/libcxx/include/experimental/__simd/abi_tag.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, 
under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP_EXPERIMENTAL___SIMD_ABI_TAG_H -#define _LIBCPP_EXPERIMENTAL___SIMD_ABI_TAG_H - -#include <cstddef> -#include <experimental/__config> -#include <experimental/__simd/internal_declaration.h> - -#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) - -_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL -inline namespace parallelism_v2 { -namespace simd_abi { - -using scalar = __scalar; - -// TODO: make this platform dependent -template <int _Np> -using fixed_size = __vec_ext<_Np>; - -template <class _Tp> -inline constexpr int max_fixed_size = 32; - -// TODO: make this platform dependent -template <class _Tp> -using compatible = __vec_ext<16 / sizeof(_Tp)>; - -// TODO: make this platform dependent -template <class _Tp> -using native = __vec_ext<_LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES / sizeof(_Tp)>; - -// TODO: make this platform dependent -template <class _Tp, size_t _Np, class... _Abis> -struct deduce { - using type = fixed_size<_Np>; -}; - -// TODO: make this platform dependent -template <class _Tp, size_t _Np, class... _Abis> -using deduce_t = typename deduce<_Tp, _Np, _Abis...>::type; - -} // namespace simd_abi -} // namespace parallelism_v2 -_LIBCPP_END_NAMESPACE_EXPERIMENTAL - -#endif // _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) -#endif // _LIBCPP_EXPERIMENTAL___SIMD_ABI_TAG_H diff --git a/libcxx/include/experimental/__simd/aligned_tag.h b/libcxx/include/experimental/__simd/aligned_tag.h index d216a21c073f3..edbb3b24931f5 100644 --- a/libcxx/include/experimental/__simd/aligned_tag.h +++ b/libcxx/include/experimental/__simd/aligned_tag.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H #define _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H -#include <__bit/bit_ceil.h> #include <__memory/assume_aligned.h> #include <cstddef> #include <experimental/__config> +#include <experimental/__simd/traits.h> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) @@ -30,9 +30,12 @@ struct element_aligned_tag { } }; +template <> +inline constexpr bool is_simd_flag_type_v<element_aligned_tag> = true; + struct vector_aligned_tag { template <class _Tp, class _Up = typename _Tp::value_type> - static constexpr size_t __alignment = std::__bit_ceil(sizeof(_Up) * _Tp::size()); + static constexpr size_t __alignment = memory_alignment_v<_Tp, _Up>; template <class _Tp, class _Up> static _LIBCPP_HIDE_FROM_ABI constexpr _Up* __apply(_Up* __ptr) { @@ -40,6 +43,9 @@ struct vector_aligned_tag { } }; +template <> +inline constexpr bool is_simd_flag_type_v<vector_aligned_tag> = true; + template <size_t _Np> struct overaligned_tag { template <class _Tp, class _Up = typename _Tp::value_type> @@ -51,6 +57,9 @@ struct overaligned_tag { } }; +template <size_t _Np> +inline constexpr bool is_simd_flag_type_v<overaligned_tag<_Np>> = true; + inline constexpr element_aligned_tag element_aligned{}; inline constexpr vector_aligned_tag vector_aligned{}; diff --git a/libcxx/include/experimental/__simd/declaration.h b/libcxx/include/experimental/__simd/declaration.h index 065faeaec3841..7b45d035c2712 100644 --- a/libcxx/include/experimental/__simd/declaration.h +++ b/libcxx/include/experimental/__simd/declaration.h @@ -10,13 +10,63 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H #define 
_LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H +#include <cstddef> #include <experimental/__config> -#include <experimental/__simd/abi_tag.h> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace parallelism_v2 { +namespace simd_abi { +template <int> +struct __vec_ext; +struct __scalar; + +using scalar = __scalar; + +// TODO: make this platform dependent +template <int _Np> +using fixed_size = __vec_ext<_Np>; + +template <class _Tp> +inline constexpr int max_fixed_size = 32; + +// TODO: make this platform dependent +template <class _Tp> +using compatible = __vec_ext<16 / sizeof(_Tp)>; + +// TODO: make this platform dependent +template <class _Tp> +using native = __vec_ext<_LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES / sizeof(_Tp)>; + +// TODO: make this platform dependent +template <class _Tp, size_t _Np, class... _Abis> +struct deduce { + using type = fixed_size<_Np>; +}; + +// TODO: make this platform dependent +template <class _Tp, size_t _Np, class... _Abis> +using deduce_t = typename deduce<_Tp, _Np, _Abis...>::type; + +} // namespace simd_abi + +template <class _Tp, class _Abi> +struct __simd_storage; + +template <class _Tp, class _Abi> +struct __mask_storage; + +template <class _Tp, class _Abi> +struct __simd_operations; + +template <class _Tp, class _Abi> +struct __mask_operations; + +struct element_aligned_tag; +struct vector_aligned_tag; +template <size_t> +struct overaligned_tag; template <class _Tp, class _Abi = simd_abi::compatible<_Tp>> class simd; diff --git a/libcxx/include/experimental/__simd/internal_declaration.h b/libcxx/include/experimental/__simd/internal_declaration.h deleted file mode 100644 index 9ad1ad1ae3192..0000000000000 --- a/libcxx/include/experimental/__simd/internal_declaration.h +++ /dev/null @@ -1,41 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP_EXPERIMENTAL___SIMD_INTERNAL_DECLARATION_H -#define _LIBCPP_EXPERIMENTAL___SIMD_INTERNAL_DECLARATION_H - -#include <experimental/__config> - -#if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) - -_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL -inline namespace parallelism_v2 { -namespace simd_abi { -template <int> -struct __vec_ext; -struct __scalar; -} // namespace simd_abi - -template <class _Tp, class _Abi> -struct __simd_storage; - -template <class _Tp, class _Abi> -struct __mask_storage; - -template <class _Tp, class _Abi> -struct __simd_operations; - -template <class _Tp, class _Abi> -struct __mask_operations; - -} // namespace parallelism_v2 -_LIBCPP_END_NAMESPACE_EXPERIMENTAL - -#endif // _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) -#endif // _LIBCPP_EXPERIMENTAL___SIMD_INTERNAL_DECLARATION_H diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h index 53fa1c29f374c..5eeff4c1e82a3 100644 --- a/libcxx/include/experimental/__simd/scalar.h +++ b/libcxx/include/experimental/__simd/scalar.h @@ -12,7 +12,7 @@ #include <cstddef> #include <experimental/__config> -#include <experimental/__simd/internal_declaration.h> +#include <experimental/__simd/declaration.h> #include <experimental/__simd/traits.h> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h index ffb328eb345b1..c345811fee7fc 100644 --- a/libcxx/include/experimental/__simd/simd.h +++ b/libcxx/include/experimental/__simd/simd.h @@ -15,9 +15,7 @@ #include <__utility/forward.h> #include <cstddef> #include <experimental/__config> -#include <experimental/__simd/abi_tag.h> #include <experimental/__simd/declaration.h> -#include <experimental/__simd/internal_declaration.h> #include <experimental/__simd/reference.h> #include <experimental/__simd/traits.h> #include <experimental/__simd/utility.h> diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h index 325b8409e3b6d..db03843b46e3a 100644 --- a/libcxx/include/experimental/__simd/simd_mask.h +++ b/libcxx/include/experimental/__simd/simd_mask.h @@ -13,9 +13,7 @@ #include <__type_traits/is_same.h> #include <cstddef> #include <experimental/__config> -#include <experimental/__simd/abi_tag.h> #include <experimental/__simd/declaration.h> -#include <experimental/__simd/internal_declaration.h> #include <experimental/__simd/reference.h> #include <experimental/__simd/traits.h> diff --git a/libcxx/include/experimental/__simd/traits.h b/libcxx/include/experimental/__simd/traits.h index 9b4abe9d0c232..ec25b4bfa7f95 100644 --- a/libcxx/include/experimental/__simd/traits.h +++ b/libcxx/include/experimental/__simd/traits.h @@ -10,14 +10,12 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H #define _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H +#include <__bit/bit_ceil.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_same.h> #include <cstddef> #include <experimental/__config> -#include <experimental/__simd/abi_tag.h> -#include <experimental/__simd/aligned_tag.h> #include <experimental/__simd/declaration.h> -#include <experimental/__simd/internal_declaration.h> #include <experimental/__simd/utility.h> #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) @@ -47,15 +45,6 @@ struct 
is_simd_mask : bool_constant<is_simd_mask_v<_Tp>> {}; template <class _Tp> inline constexpr bool is_simd_flag_type_v = false; -template <> -inline constexpr bool is_simd_flag_type_v<element_aligned_tag> = true; - -template <> -inline constexpr bool is_simd_flag_type_v<vector_aligned_tag> = true; - -template <size_t _Np> -inline constexpr bool is_simd_flag_type_v<overaligned_tag<_Np>> = true; - template <class _Tp> struct is_simd_flag_type : bool_constant<is_simd_flag_type_v<_Tp>> {}; @@ -71,7 +60,7 @@ inline constexpr size_t simd_size_v = simd_size<_Tp, _Abi>::value; template <class _Tp, class _Up = typename _Tp::value_type, bool = (is_simd_v<_Tp> && __is_vectorizable_v<_Up>) || (is_simd_mask_v<_Tp> && is_same_v<_Up, bool>)> -struct memory_alignment : integral_constant<size_t, vector_aligned_tag::__alignment<_Tp, _Up>> {}; +struct memory_alignment : integral_constant<size_t, std::__bit_ceil(sizeof(_Up) * _Tp::size())> {}; template <class _Tp, class _Up> struct memory_alignment<_Tp, _Up, false> {}; diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h index 56a0b888104bf..07ba032f493b1 100644 --- a/libcxx/include/experimental/__simd/vec_ext.h +++ b/libcxx/include/experimental/__simd/vec_ext.h @@ -15,7 +15,7 @@ #include <__utility/integer_sequence.h> #include <cstddef> #include <experimental/__config> -#include <experimental/__simd/internal_declaration.h> +#include <experimental/__simd/declaration.h> #include <experimental/__simd/traits.h> #include <experimental/__simd/utility.h> diff --git a/libcxx/include/experimental/simd b/libcxx/include/experimental/simd index 56858832857c1..adca9faa47bb0 100644 --- a/libcxx/include/experimental/simd +++ b/libcxx/include/experimental/simd @@ -78,7 +78,6 @@ inline namespace parallelism_v2 { #endif #include <experimental/__config> -#include <experimental/__simd/abi_tag.h> #include <experimental/__simd/aligned_tag.h> #include <experimental/__simd/declaration.h> #include <experimental/__simd/scalar.h> diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index a37e96205cf2e..d10670d4faaff 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -530,10 +530,8 @@ module std_experimental [system] { export * } module simd { - module abi_tag { private header "experimental/__simd/abi_tag.h" } module aligned_tag { private header "experimental/__simd/aligned_tag.h" } module declaration { private header "experimental/__simd/declaration.h" } - module internal_declaration { private header "experimental/__simd/internal_declaration.h" } module reference { private header "experimental/__simd/reference.h" } module scalar { private header "experimental/__simd/scalar.h" } module simd { private header "experimental/__simd/simd.h" } From e060392f0e7791a695ef8578d7d6543e89995f8c Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com> Date: Sun, 24 Dec 2023 07:51:51 +0000 Subject: [PATCH 224/342] [gn build] Port 50ae0da05829 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index cdd74ecbbca30..fc24717974c56 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -980,10 +980,8 @@ if (current_toolchain == default_toolchain) { "execution", "experimental/__config", "experimental/__memory", - "experimental/__simd/abi_tag.h", 
"experimental/__simd/aligned_tag.h", "experimental/__simd/declaration.h", - "experimental/__simd/internal_declaration.h", "experimental/__simd/reference.h", "experimental/__simd/scalar.h", "experimental/__simd/simd.h", From d26791b09bae4f8bf0f9531957a14864f8696f15 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng <dtcxzyw2333@gmail.com> Date: Sun, 24 Dec 2023 16:04:49 +0800 Subject: [PATCH 225/342] [Clang][RISCV] Use `__builtin_popcount` in `__riscv_cpop_32/64` (#76286) This patch replaces `__builtin_riscv_cpop_32/64` with `__builtin_popcount(ll)` because `__builtin_riscv_cpop_32/64` is not implemented in clang. --- clang/lib/Headers/riscv_bitmanip.h | 4 +-- clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c | 34 ++++++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/clang/lib/Headers/riscv_bitmanip.h b/clang/lib/Headers/riscv_bitmanip.h index 1a81cc8618c97..044cbaa037e43 100644 --- a/clang/lib/Headers/riscv_bitmanip.h +++ b/clang/lib/Headers/riscv_bitmanip.h @@ -34,7 +34,7 @@ __riscv_ctz_32(uint32_t __x) { static __inline__ unsigned __attribute__((__always_inline__, __nodebug__)) __riscv_cpop_32(uint32_t __x) { - return __builtin_riscv_cpop_32(__x); + return __builtin_popcount(__x); } #if __riscv_xlen == 64 @@ -55,7 +55,7 @@ __riscv_ctz_64(uint64_t __x) { static __inline__ unsigned __attribute__((__always_inline__, __nodebug__)) __riscv_cpop_64(uint64_t __x) { - return __builtin_riscv_cpop_64(__x); + return __builtin_popcountll(__x); } #endif #endif // defined(__riscv_zbb) diff --git a/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c b/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c index 5edbc578e82e9..fbc51b4bf144a 100644 --- a/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c +++ b/clang/test/CodeGen/RISCV/rvb-intrinsics/zbb.c @@ -51,8 +51,8 @@ unsigned int clz_32(uint32_t a) { // RV64ZBB-LABEL: @clz_64( // RV64ZBB-NEXT: entry: // RV64ZBB-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[A:%.*]], i1 false) -// RV64ZBB-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 -// RV64ZBB-NEXT: ret i32 [[CAST]] +// RV64ZBB-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32 +// RV64ZBB-NEXT: ret i32 [[CAST_I]] // unsigned int clz_64(uint64_t a) { return __riscv_clz_64(a); @@ -77,10 +77,36 @@ unsigned int ctz_32(uint32_t a) { // RV64ZBB-LABEL: @ctz_64( // RV64ZBB-NEXT: entry: // RV64ZBB-NEXT: [[TMP0:%.*]] = call i64 @llvm.cttz.i64(i64 [[A:%.*]], i1 false) -// RV64ZBB-NEXT: [[CAST:%.*]] = trunc i64 [[TMP0]] to i32 -// RV64ZBB-NEXT: ret i32 [[CAST]] +// RV64ZBB-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32 +// RV64ZBB-NEXT: ret i32 [[CAST_I]] // unsigned int ctz_64(uint64_t a) { return __riscv_ctz_64(a); } #endif + +// RV32ZBB-LABEL: @cpop_32( +// RV32ZBB-NEXT: entry: +// RV32ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[A:%.*]]) +// RV32ZBB-NEXT: ret i32 [[TMP0]] +// +// RV64ZBB-LABEL: @cpop_32( +// RV64ZBB-NEXT: entry: +// RV64ZBB-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctpop.i32(i32 [[A:%.*]]) +// RV64ZBB-NEXT: ret i32 [[TMP0]] +// +unsigned int cpop_32(uint32_t a) { + return __riscv_cpop_32(a); +} + +#if __riscv_xlen == 64 +// RV64ZBB-LABEL: @cpop_64( +// RV64ZBB-NEXT: entry: +// RV64ZBB-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[A:%.*]]) +// RV64ZBB-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32 +// RV64ZBB-NEXT: ret i32 [[CAST_I]] +// +unsigned int cpop_64(uint64_t a) { + return __riscv_cpop_64(a); +} +#endif From 1dc715a8a4d058dc8b7afbf9ce3fff5a3ff6e4ef Mon Sep 17 00:00:00 2001 From: Yingwei Zheng <dtcxzyw2333@gmail.com> Date: Sun, 24 Dec 2023 16:14:22 +0800 Subject: 
[PATCH 226/342] [Clang][RISCV] Add missing support for `__riscv_clmulr_32/64` in `riscv_bitmanip.h` (#76289) This patch adds support for `__riscv_clmulr_32/64` in `riscv_bitmanip.h`. It also fixes the extension requirements of `clmul/clmulh`. --- clang/lib/Headers/riscv_bitmanip.h | 20 +++++++++++++++++-- clang/test/CodeGen/RISCV/rvb-intrinsics/zbc.c | 14 ++++++------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/clang/lib/Headers/riscv_bitmanip.h b/clang/lib/Headers/riscv_bitmanip.h index 044cbaa037e43..2bc7ee022a96b 100644 --- a/clang/lib/Headers/riscv_bitmanip.h +++ b/clang/lib/Headers/riscv_bitmanip.h @@ -120,7 +120,23 @@ __riscv_zip_32(uint32_t __x) { #endif #endif // defined(__riscv_zbkb) -#if defined(__riscv_zbkc) +#if defined(__riscv_zbc) +#if __riscv_xlen == 32 +static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) +__riscv_clmulr_32(uint32_t __x, uint32_t __y) { + return __builtin_riscv_clmulr_32(__x, __y); +} +#endif + +#if __riscv_xlen == 64 +static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__)) +__riscv_clmulr_64(uint64_t __x, uint64_t __y) { + return __builtin_riscv_clmulr_64(__x, __y); +} +#endif +#endif // defined(__riscv_zbc) + +#if defined(__riscv_zbkc) || defined(__riscv_zbc) static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__)) __riscv_clmul_32(uint32_t __x, uint32_t __y) { return __builtin_riscv_clmul_32(__x, __y); @@ -144,7 +160,7 @@ __riscv_clmulh_64(uint64_t __x, uint64_t __y) { return __builtin_riscv_clmulh_64(__x, __y); } #endif -#endif // defined(__riscv_zbkc) +#endif // defined(__riscv_zbkc) || defined(__riscv_zbc) #if defined(__riscv_zbkx) #if __riscv_xlen == 32 diff --git a/clang/test/CodeGen/RISCV/rvb-intrinsics/zbc.c b/clang/test/CodeGen/RISCV/rvb-intrinsics/zbc.c index ae9153eff155e..93db3a482ef2b 100644 --- a/clang/test/CodeGen/RISCV/rvb-intrinsics/zbc.c +++ b/clang/test/CodeGen/RISCV/rvb-intrinsics/zbc.c @@ -6,7 +6,7 @@ // RUN: -disable-O0-optnone | opt -S -passes=mem2reg \ // RUN: | FileCheck %s -check-prefix=RV64ZBC -#include <stdint.h> +#include <riscv_bitmanip.h> #if __riscv_xlen == 64 // RV64ZBC-LABEL: @clmul_64( @@ -15,7 +15,7 @@ // RV64ZBC-NEXT: ret i64 [[TMP0]] // uint64_t clmul_64(uint64_t a, uint64_t b) { - return __builtin_riscv_clmul_64(a, b); + return __riscv_clmul_64(a, b); } // RV64ZBC-LABEL: @clmulh_64( @@ -24,7 +24,7 @@ uint64_t clmul_64(uint64_t a, uint64_t b) { // RV64ZBC-NEXT: ret i64 [[TMP0]] // uint64_t clmulh_64(uint64_t a, uint64_t b) { - return __builtin_riscv_clmulh_64(a, b); + return __riscv_clmulh_64(a, b); } // RV64ZBC-LABEL: @clmulr_64( @@ -33,7 +33,7 @@ uint64_t clmulh_64(uint64_t a, uint64_t b) { // RV64ZBC-NEXT: ret i64 [[TMP0]] // uint64_t clmulr_64(uint64_t a, uint64_t b) { - return __builtin_riscv_clmulr_64(a, b); + return __riscv_clmulr_64(a, b); } #endif @@ -48,7 +48,7 @@ uint64_t clmulr_64(uint64_t a, uint64_t b) { // RV64ZBC-NEXT: ret i32 [[TMP0]] // uint32_t clmul_32(uint32_t a, uint32_t b) { - return __builtin_riscv_clmul_32(a, b); + return __riscv_clmul_32(a, b); } #if __riscv_xlen == 32 @@ -58,7 +58,7 @@ uint32_t clmul_32(uint32_t a, uint32_t b) { // RV32ZBC-NEXT: ret i32 [[TMP0]] // uint32_t clmulh_32(uint32_t a, uint32_t b) { - return __builtin_riscv_clmulh_32(a, b); + return __riscv_clmulh_32(a, b); } // RV32ZBC-LABEL: @clmulr_32( @@ -67,6 +67,6 @@ uint32_t clmulh_32(uint32_t a, uint32_t b) { // RV32ZBC-NEXT: ret i32 [[TMP0]] // uint32_t clmulr_32(uint32_t a, uint32_t b) { - return __builtin_riscv_clmulr_32(a, b); + return 
__riscv_clmulr_32(a, b); } #endif From 8f9803b5ab0b03c31c8cb182b44bd2eb70d9d8b0 Mon Sep 17 00:00:00 2001 From: Owen Pan <owenpiano@gmail.com> Date: Sun, 24 Dec 2023 01:05:10 -0800 Subject: [PATCH 227/342] [clang-format] Add an fnmatch-like function for .clang-format-ignore (#76021) This is needed because Windows doesn't have anything equivalent to the POSIX fnmatch() function. --- clang/lib/Format/CMakeLists.txt | 1 + clang/lib/Format/MatchFilePath.cpp | 122 +++++++++++++ clang/lib/Format/MatchFilePath.h | 22 +++ clang/unittests/Format/CMakeLists.txt | 1 + clang/unittests/Format/MatchFilePathTest.cpp | 169 +++++++++++++++++++ 5 files changed, 315 insertions(+) create mode 100644 clang/lib/Format/MatchFilePath.cpp create mode 100644 clang/lib/Format/MatchFilePath.h create mode 100644 clang/unittests/Format/MatchFilePathTest.cpp diff --git a/clang/lib/Format/CMakeLists.txt b/clang/lib/Format/CMakeLists.txt index 015ec7c0cc84e..84a3c136f650a 100644 --- a/clang/lib/Format/CMakeLists.txt +++ b/clang/lib/Format/CMakeLists.txt @@ -11,6 +11,7 @@ add_clang_library(clangFormat IntegerLiteralSeparatorFixer.cpp MacroCallReconstructor.cpp MacroExpander.cpp + MatchFilePath.cpp NamespaceEndCommentsFixer.cpp ObjCPropertyAttributeOrderFixer.cpp QualifierAlignmentFixer.cpp diff --git a/clang/lib/Format/MatchFilePath.cpp b/clang/lib/Format/MatchFilePath.cpp new file mode 100644 index 0000000000000..412ee4954587e --- /dev/null +++ b/clang/lib/Format/MatchFilePath.cpp @@ -0,0 +1,122 @@ +//===--- MatchFilePath.cpp - Match file path with pattern -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the functionality of matching a file path name to +/// a pattern, similar to the POSIX fnmatch() function. +/// +//===----------------------------------------------------------------------===// + +#include "MatchFilePath.h" + +using namespace llvm; + +namespace clang { +namespace format { + +// Check whether `FilePath` matches `Pattern` based on POSIX (1003.1-2008) +// 2.13.1, 2.13.2, and Rule 1 of 2.13.3. +bool matchFilePath(StringRef Pattern, StringRef FilePath) { + assert(!Pattern.empty()); + assert(!FilePath.empty()); + + // No match if `Pattern` ends with a non-meta character not equal to the last + // character of `FilePath`. + if (const auto C = Pattern.back(); !strchr("?*]", C) && C != FilePath.back()) + return false; + + constexpr auto Separator = '/'; + const auto EOP = Pattern.size(); // End of `Pattern`. + const auto End = FilePath.size(); // End of `FilePath`. + unsigned I = 0; // Index to `Pattern`. + + for (unsigned J = 0; J < End; ++J) { + if (I == EOP) + return false; + + switch (const auto F = FilePath[J]; Pattern[I]) { + case '\\': + if (++I == EOP || F != Pattern[I]) + return false; + break; + case '?': + if (F == Separator) + return false; + break; + case '*': { + while (++I < EOP && Pattern[I] == '*') { // Skip consecutive stars. + } + const auto K = FilePath.find(Separator, J); // Index of next `Separator`. + const bool NoMoreSeparatorsInFilePath = K == StringRef::npos; + if (I == EOP) // `Pattern` ends with a star. + return NoMoreSeparatorsInFilePath; + // `Pattern` ends with a lone backslash. 
+ if (Pattern[I] == '\\' && ++I == EOP) + return false; + // The star is followed by a (possibly escaped) `Separator`. + if (Pattern[I] == Separator) { + if (NoMoreSeparatorsInFilePath) + return false; + J = K; // Skip to next `Separator` in `FilePath`. + break; + } + // Recurse. + for (auto Pat = Pattern.substr(I); J < End && FilePath[J] != Separator; + ++J) { + if (matchFilePath(Pat, FilePath.substr(J))) + return true; + } + return false; + } + case '[': + // Skip e.g. `[!]`. + if (I + 3 < EOP || (I + 3 == EOP && Pattern[I + 1] != '!')) { + // Skip unpaired `[`, brackets containing slashes, and `[]`. + if (const auto K = Pattern.find_first_of("]/", I + 1); + K != StringRef::npos && Pattern[K] == ']' && K > I + 1) { + if (F == Separator) + return false; + ++I; // After the `[`. + bool Negated = false; + if (Pattern[I] == '!') { + Negated = true; + ++I; // After the `!`. + } + bool Match = false; + do { + if (I + 2 < K && Pattern[I + 1] == '-') { + Match = Pattern[I] <= F && F <= Pattern[I + 2]; + I += 3; // After the range, e.g. `A-Z`. + } else { + Match = F == Pattern[I++]; + } + } while (!Match && I < K); + if (Negated ? Match : !Match) + return false; + I = K + 1; // After the `]`. + continue; + } + } + [[fallthrough]]; // Match `[` literally. + default: + if (F != Pattern[I]) + return false; + } + + ++I; + } + + // Match trailing stars with null strings. + while (I < EOP && Pattern[I] == '*') + ++I; + + return I == EOP; +} + +} // namespace format +} // namespace clang diff --git a/clang/lib/Format/MatchFilePath.h b/clang/lib/Format/MatchFilePath.h new file mode 100644 index 0000000000000..482dab7c748e5 --- /dev/null +++ b/clang/lib/Format/MatchFilePath.h @@ -0,0 +1,22 @@ +//===--- MatchFilePath.h ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LIB_FORMAT_MATCHFILEPATH_H +#define LLVM_CLANG_LIB_FORMAT_MATCHFILEPATH_H + +#include "llvm/ADT/StringRef.h" + +namespace clang { +namespace format { + +bool matchFilePath(llvm::StringRef Pattern, llvm::StringRef FilePath); + +} // end namespace format +} // end namespace clang + +#endif diff --git a/clang/unittests/Format/CMakeLists.txt b/clang/unittests/Format/CMakeLists.txt index 53136328928f5..71f5886d946c8 100644 --- a/clang/unittests/Format/CMakeLists.txt +++ b/clang/unittests/Format/CMakeLists.txt @@ -27,6 +27,7 @@ add_clang_unittest(FormatTests IntegerLiteralSeparatorTest.cpp MacroCallReconstructorTest.cpp MacroExpanderTest.cpp + MatchFilePathTest.cpp NamespaceEndCommentsFixerTest.cpp ObjCPropertyAttributeOrderFixerTest.cpp QualifierFixerTest.cpp diff --git a/clang/unittests/Format/MatchFilePathTest.cpp b/clang/unittests/Format/MatchFilePathTest.cpp new file mode 100644 index 0000000000000..55723584ddc80 --- /dev/null +++ b/clang/unittests/Format/MatchFilePathTest.cpp @@ -0,0 +1,169 @@ +//===- unittest/Format/MatchFilePathTest.cpp ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../../lib/Format/MatchFilePath.h" +#include "gtest/gtest.h" + +namespace clang { +namespace format { +namespace { + +class MatchFilePathTest : public ::testing::Test { +protected: + bool match(llvm::StringRef FilePath, llvm::StringRef Pattern) { + return matchFilePath(Pattern, FilePath); + } +}; + +// Most of the test cases below are from: +// https://github.com/python/cpython/blob/main/Lib/test/test_fnmatch.py + +TEST_F(MatchFilePathTest, Wildcard) { + EXPECT_TRUE(match("abc", "?*?")); + EXPECT_TRUE(match("abc", "???*")); + EXPECT_TRUE(match("abc", "*???")); + EXPECT_TRUE(match("abc", "???")); + EXPECT_TRUE(match("abc", "*")); + EXPECT_TRUE(match("abc", "ab[cd]")); + EXPECT_TRUE(match("abc", "ab[!de]")); + EXPECT_FALSE(match("abc", "ab[de]")); + EXPECT_FALSE(match("a", "??")); + EXPECT_FALSE(match("a", "b")); +} + +TEST_F(MatchFilePathTest, Backslash) { + EXPECT_TRUE(match("a?", R"(a\?)")); + EXPECT_FALSE(match("a\\", R"(a\)")); + EXPECT_TRUE(match("\\", R"([\])")); + EXPECT_TRUE(match("a", R"([!\])")); + EXPECT_FALSE(match("\\", R"([!\])")); +} + +TEST_F(MatchFilePathTest, Newline) { + EXPECT_TRUE(match("foo\nbar", "foo*")); + EXPECT_TRUE(match("foo\nbar\n", "foo*")); + EXPECT_FALSE(match("\nfoo", "foo*")); + EXPECT_TRUE(match("\n", "*")); +} + +TEST_F(MatchFilePathTest, Star) { + EXPECT_TRUE(match(std::string(50, 'a'), "*a*a*a*a*a*a*a*a*a*a")); + EXPECT_FALSE(match((std::string(50, 'a') + 'b'), "*a*a*a*a*a*a*a*a*a*a")); +} + +TEST_F(MatchFilePathTest, CaseSensitive) { + EXPECT_TRUE(match("abc", "abc")); + EXPECT_FALSE(match("AbC", "abc")); + EXPECT_FALSE(match("abc", "AbC")); + EXPECT_TRUE(match("AbC", "AbC")); +} + +TEST_F(MatchFilePathTest, PathSeparators) { + EXPECT_TRUE(match("usr/bin", "usr/bin")); + EXPECT_TRUE(match("usr\\bin", R"(usr\\bin)")); +} + +TEST_F(MatchFilePathTest, NumericEscapeSequence) { + EXPECT_TRUE(match("test", "te*")); + EXPECT_TRUE(match("test\xff", "te*\xff")); + EXPECT_TRUE(match("foo\nbar", "foo*")); +} + +TEST_F(MatchFilePathTest, ValidBrackets) { + EXPECT_TRUE(match("z", "[az]")); + EXPECT_FALSE(match("z", "[!az]")); + EXPECT_TRUE(match("a", "[aa]")); + EXPECT_TRUE(match("^", "[^az]")); + EXPECT_TRUE(match("[", "[[az]")); + EXPECT_FALSE(match("]", "[!]]")); +} + +TEST_F(MatchFilePathTest, InvalidBrackets) { + EXPECT_TRUE(match("[", "[")); + EXPECT_TRUE(match("[]", "[]")); + EXPECT_TRUE(match("[!", "[!")); + EXPECT_TRUE(match("[!]", "[!]")); +} + +TEST_F(MatchFilePathTest, Range) { + EXPECT_TRUE(match("c", "[b-d]")); + EXPECT_FALSE(match("c", "[!b-d]")); + EXPECT_TRUE(match("y", "[b-dx-z]")); + EXPECT_FALSE(match("y", "[!b-dx-z]")); +} + +TEST_F(MatchFilePathTest, Hyphen) { + EXPECT_FALSE(match("#", "[!-#]")); + EXPECT_FALSE(match("-", "[!--.]")); + EXPECT_TRUE(match("_", "[^-`]")); + EXPECT_TRUE(match("]", "[[-^]")); + EXPECT_TRUE(match("]", R"([\-^])")); + EXPECT_TRUE(match("-", "[b-]")); + EXPECT_FALSE(match("-", "[!b-]")); + EXPECT_TRUE(match("-", "[-b]")); + EXPECT_FALSE(match("-", "[!-b]")); + EXPECT_TRUE(match("-", "[-]")); + EXPECT_FALSE(match("-", "[!-]")); +} + +TEST_F(MatchFilePathTest, UpperLELower) { + EXPECT_FALSE(match("c", "[d-b]")); + EXPECT_TRUE(match("c", "[!d-b]")); + EXPECT_TRUE(match("y", "[d-bx-z]")); + EXPECT_FALSE(match("y", "[!d-bx-z]")); + EXPECT_TRUE(match("_", "[d-b^-`]")); + EXPECT_TRUE(match("]", "[d-b[-^]")); + EXPECT_TRUE(match("b", "[b-b]")); +} + 
+TEST_F(MatchFilePathTest, SlashAndBackslashInBrackets) { + EXPECT_FALSE(match("/", "[/]")); + EXPECT_TRUE(match("\\", R"([\])")); + EXPECT_TRUE(match("[/]", "[/]")); + EXPECT_TRUE(match("\\", R"([\t])")); + EXPECT_TRUE(match("t", R"([\t])")); + EXPECT_FALSE(match("\t", R"([\t])")); +} + +TEST_F(MatchFilePathTest, SlashAndBackslashInRange) { + EXPECT_FALSE(match("a/b", "a[.-0]b")); + EXPECT_TRUE(match("a\\b", "a[Z-^]b")); + EXPECT_FALSE(match("a/b", "a[/-0]b")); + EXPECT_TRUE(match("a[/-0]b", "a[/-0]b")); + EXPECT_FALSE(match("a/b", "a[.-/]b")); + EXPECT_TRUE(match("a[.-/]b", "a[.-/]b")); + EXPECT_TRUE(match("a\\b", R"(a[\-^]b)")); + EXPECT_TRUE(match("a\\b", R"(a[Z-\]b)")); +} + +TEST_F(MatchFilePathTest, Brackets) { + EXPECT_TRUE(match("[", "[[]")); + EXPECT_TRUE(match("&", "[a&&b]")); + EXPECT_TRUE(match("|", "[a||b]")); + EXPECT_TRUE(match("~", "[a~~b]")); + EXPECT_TRUE(match(",", "[a-z+--A-Z]")); + EXPECT_FALSE(match(".", "[a-z--/A-Z]")); +} + +TEST_F(MatchFilePathTest, Path) { + EXPECT_TRUE(match(".clang-format", "*")); + EXPECT_TRUE(match(".git", "*git*")); + EXPECT_TRUE(match(".gitignore", "*git*")); + EXPECT_TRUE(match("foo/bar", "foo*/*bar")); + EXPECT_TRUE(match("foo/bar", "*/*")); + EXPECT_TRUE(match("foo/bar", R"(*foo*\/*bar*)")); + EXPECT_FALSE(match("foo/bar", "foo*")); + EXPECT_FALSE(match("foo/bar", "foo?bar")); + EXPECT_FALSE(match("foo/bar", "foo*bar")); + EXPECT_FALSE(match("foobar", "foo*/*")); + EXPECT_FALSE(match("foo\\", R"(foo*\)")); +} + +} // namespace +} // namespace format +} // namespace clang From 51b988efb06f0343e7b71c9aec9ec3195412179d Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com> Date: Sun, 24 Dec 2023 09:05:28 +0000 Subject: [PATCH 228/342] [gn build] Port 8f9803b5ab0b --- llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn | 1 + llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn index ecb96b36ea74d..58cd59a7c941b 100644 --- a/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Format/BUILD.gn @@ -19,6 +19,7 @@ static_library("Format") { "IntegerLiteralSeparatorFixer.cpp", "MacroCallReconstructor.cpp", "MacroExpander.cpp", + "MatchFilePath.cpp", "NamespaceEndCommentsFixer.cpp", "ObjCPropertyAttributeOrderFixer.cpp", "QualifierAlignmentFixer.cpp", diff --git a/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn index c34c11fc55c4a..b35061970c226 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Format/BUILD.gn @@ -35,6 +35,7 @@ unittest("FormatTests") { "IntegerLiteralSeparatorTest.cpp", "MacroCallReconstructorTest.cpp", "MacroExpanderTest.cpp", + "MatchFilePathTest.cpp", "NamespaceEndCommentsFixerTest.cpp", "ObjCPropertyAttributeOrderFixerTest.cpp", "QualifierFixerTest.cpp", From 1479fe849249ce628ebd3058bed5d9db7d413a3e Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Sun, 24 Dec 2023 18:54:54 +0800 Subject: [PATCH 229/342] [X86][NFC] Rename variables and define a var for duplicated strings in X86InstrArithmetic.td This patch is to extract the NFC in #76319 into a separate commit. 
--- llvm/lib/Target/X86/X86InstrArithmetic.td | 38 +++++++++++------------ llvm/lib/Target/X86/X86InstrUtils.td | 2 ++ 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 22394545a7fa2..936db48bb9df4 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -47,7 +47,7 @@ def PLEA64r : PseudoI<(outs GR64:$dst), (ins anymem:$src), []>; // BinOpRR - Instructions that read "reg, reg". class BinOpRR<bits<8> o, string m, X86TypeInfo t, dag out, list<dag> p> : ITy<o, MRMDestReg, t, out, (ins t.RegClass:$src1, t.RegClass:$src2), m, - "{$src2, $src1|$src1, $src2}", p>, Sched<[WriteALU]>; + binop_args, p>, Sched<[WriteALU]>; // BinOpRR_F - Instructions that read "reg, reg" and write EFLAGS only. class BinOpRR_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node> : BinOpRR<o, m, t, (outs), @@ -86,7 +86,7 @@ class BinOpRRF_RF_Rev<bits<8> o, string m, X86TypeInfo t> // BinOpRM - Instructions that read "reg, [mem]". class BinOpRM<bits<8> o, string m, X86TypeInfo t, dag out, list<dag> p> : ITy<o, MRMSrcMem, t, out, (ins t.RegClass:$src1, t.MemOperand:$src2), m, - "{$src2, $src1|$src1, $src2}", p>, + binop_args, p>, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]> { let mayLoad = 1; } @@ -117,7 +117,7 @@ class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node> // BinOpRI - Instructions that read "reg, imm". class BinOpRI<bits<8> o, string m, X86TypeInfo t, Format f, dag out, list<dag> p> : ITy<o, f, t, out, (ins t.RegClass:$src1, t.ImmOperand:$src2), m, - "{$src2, $src1|$src1, $src2}", p>, Sched<[WriteALU]> { + binop_args, p>, Sched<[WriteALU]> { let ImmT = t.ImmEncoding; } // BinOpRI_F - Instructions that read "reg, imm" and write EFLAGS only. @@ -143,7 +143,7 @@ class BinOpRIF_RF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> // BinOpRI8 - Instructions that read "reg, imm8". class BinOpRI8<bits<8> o, string m, X86TypeInfo t, Format f, dag out> : ITy<o, f, t, out, (ins t.RegClass:$src1, t.Imm8Operand:$src2), m, - "{$src2, $src1|$src1, $src2}", []>, Sched<[WriteALU]> { + binop_args, []>, Sched<[WriteALU]> { let ImmT = Imm8; } // BinOpRI8_F - Instructions that read "reg, imm8" and write EFLAGS only. @@ -161,20 +161,20 @@ class BinOpRI8F_RF<bits<8> o, string m, X86TypeInfo t, Format f> // BinOpMR - Instructions that read "[mem], reg". class BinOpMR<bits<8> o, string m, X86TypeInfo t, list<dag> p> - : ITy<o, MRMDestMem, t, (outs), (ins t.MemOperand:$dst, t.RegClass:$src), m, - "{$src, $dst|$dst, $src}", p> { + : ITy<o, MRMDestMem, t, (outs), (ins t.MemOperand:$src1, t.RegClass:$src2), m, + binop_args, p> { let mayLoad = 1; } // BinOpMR_F - Instructions that read "[mem], imm8" and write EFLAGS only. class BinOpMR_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node> : BinOpMR<o, m, t, - [(set EFLAGS, (node (t.LoadNode addr:$dst), t.RegClass:$src))]>, + [(set EFLAGS, (node (t.LoadNode addr:$src1), t.RegClass:$src2))]>, Sched<[WriteALU.Folded, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, WriteALU.ReadAfterFold]>, DefEFLAGS; // BinOpMR_MF - Instructions that read "[mem], reg" and write "[mem]", EFLAGS. 
class BinOpMR_MF<bits<8> o, string m, X86TypeInfo t, SDNode node> : BinOpMR<o, m, t, - [(store (node (load addr:$dst), t.RegClass:$src), addr:$dst), + [(store (node (load addr:$src1), t.RegClass:$src2), addr:$src1), (implicit EFLAGS)]>, Sched<[WriteALURMW, // base, scale, index, offset, segment @@ -187,8 +187,8 @@ class BinOpMR_MF<bits<8> o, string m, X86TypeInfo t, SDNode node> // read/write EFLAGS. class BinOpMRF_MF<bits<8> o, string m, X86TypeInfo t, SDNode node> : BinOpMR<o, m, t, - [(store (node (load addr:$dst), t.RegClass:$src, EFLAGS), - addr:$dst), (implicit EFLAGS)]>, + [(store (node (load addr:$src1), t.RegClass:$src2, EFLAGS), + addr:$src1), (implicit EFLAGS)]>, Sched<[WriteADCRMW, // base, scale, index, offset, segment ReadDefault, ReadDefault, ReadDefault, @@ -201,8 +201,8 @@ class BinOpMRF_MF<bits<8> o, string m, X86TypeInfo t, SDNode node> // BinOpMI - Instructions that read "[mem], imm". class BinOpMI<bits<8> o, string m, X86TypeInfo t, Format f, list<dag> p> - : ITy<o, f, t, (outs), (ins t.MemOperand:$dst, t.ImmOperand:$src), m, - "{$src, $dst|$dst, $src}", p> { + : ITy<o, f, t, (outs), (ins t.MemOperand:$src1, t.ImmOperand:$src2), m, + binop_args, p> { let ImmT = t.ImmEncoding; let mayLoad = 1; } @@ -210,13 +210,13 @@ class BinOpMI<bits<8> o, string m, X86TypeInfo t, Format f, list<dag> p> class BinOpMI_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, Format f> : BinOpMI<o, m, t, f, - [(set EFLAGS, (node (t.LoadNode addr:$dst), t.ImmOperator:$src))]>, + [(set EFLAGS, (node (t.LoadNode addr:$src1), t.ImmOperator:$src2))]>, Sched<[WriteALU.Folded]>, DefEFLAGS; // BinOpMI_MF - Instructions that read "[mem], imm" and write "[mem]", EFLAGS. class BinOpMI_MF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> : BinOpMI<o, m, t, f, - [(store (node (t.VT (load addr:$dst)), - t.ImmOperator:$src), addr:$dst), (implicit EFLAGS)]>, + [(store (node (t.VT (load addr:$src1)), + t.ImmOperator:$src2), addr:$src1), (implicit EFLAGS)]>, Sched<[WriteALURMW]>, DefEFLAGS { let mayStore = 1; } @@ -224,16 +224,16 @@ class BinOpMI_MF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> // read/write EFLAGS. class BinOpMIF_MF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> : BinOpMI<o, m, t, f, - [(store (node (t.VT (load addr:$dst)), - t.ImmOperator:$src, EFLAGS), addr:$dst), (implicit EFLAGS)]>, + [(store (node (t.VT (load addr:$src1)), + t.ImmOperator:$src2, EFLAGS), addr:$src1), (implicit EFLAGS)]>, Sched<[WriteADCRMW]>, DefEFLAGS, UseEFLAGS { let mayStore = 1; } // BinOpMI8 - Instructions that read "[mem], imm8". 
class BinOpMI8<string m, X86TypeInfo t, Format f> - : ITy<0x83, f, t, (outs), (ins t.MemOperand:$dst, t.Imm8Operand:$src), m, - "{$src, $dst|$dst, $src}", []> { + : ITy<0x83, f, t, (outs), (ins t.MemOperand:$src1, t.Imm8Operand:$src2), m, + binop_args, []> { let ImmT = Imm8; let mayLoad = 1; } diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index ac0507fce94fb..9499753143d9d 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -967,3 +967,5 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, let hasSideEffects = 0; let hasREX_W = typeinfo.HasREX_W; } + +defvar binop_args = "{$src2, $src1|$src1, $src2}"; From eea217681d01af8935e7db7ff981daaa0541eaee Mon Sep 17 00:00:00 2001 From: youkaichao <youkaichao@gmail.com> Date: Sun, 24 Dec 2023 20:11:27 +0800 Subject: [PATCH 230/342] [Doc] update the usage of opt with mem2reg pass in tutorial (#76282) The current command will raise an error: > The `opt -passname` syntax for the new pass manager is not supported, please use `opt -passes=<pipeline>` (or the `-p` alias for a more concise version). Update the usage now. --- llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst index 0347127d0cdf1..8fd4c39d3ff47 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl07.rst @@ -182,7 +182,7 @@ example through the pass, for example, you'll get: .. code-block:: bash - $ llvm-as < example.ll | opt -mem2reg | llvm-dis + $ llvm-as < example.ll | opt -passes=mem2reg | llvm-dis @G = weak global i32 0 @H = weak global i32 0 From 1e710cfc8091b901f53828318287c650332194a7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim <llvm-dev@redking.me.uk> Date: Sun, 24 Dec 2023 13:19:10 +0000 Subject: [PATCH 231/342] [DAG] Add TLI::isTruncateFree(SDValue, EVT) wrapper. Similar to the existing isZExtFree(SDValue, EVT) wrapper, this will allow targets to override for specific cases (e.g. free truncation of an ext/extload node). But for now its just used to wrap the existing isTruncateFree(EVT, EVT) call. --- llvm/include/llvm/CodeGen/TargetLowering.h | 6 ++++++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 +++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 490125164ab34..ed2b513be9608 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2869,6 +2869,12 @@ class TargetLoweringBase { getApproximateEVTForLLT(ToTy, DL, Ctx)); } + /// Return true if truncating the specific node Val to type VT2 is free. + virtual bool isTruncateFree(SDValue Val, EVT VT2) const { + // Fallback to type matching. + return isTruncateFree(Val.getValueType(), VT2); + } + virtual bool isProfitableToHoist(Instruction *I) const { return true; } /// Return true if the extension represented by \p I is free. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c92a0c2a06d45..0d46c7868d87e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13703,8 +13703,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::TRUNCATE && N0.getOperand(1).getOpcode() == ISD::Constant && - (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(), - N0.getValueType()) || + (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0), N0.getValueType()) || !TLI.isZExtFree(N0.getValueType(), VT))) { SDValue X = N0.getOperand(0).getOperand(0); X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT); @@ -13935,8 +13934,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::TRUNCATE && N0.getOperand(1).getOpcode() == ISD::Constant && - !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(), - N0.getValueType())) { + !TLI.isTruncateFree(N0.getOperand(0).getOperand(0), N0.getValueType())) { SDLoc DL(N); SDValue X = DAG.getAnyExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT); SDValue Y = DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(1)); @@ -18855,8 +18853,7 @@ struct LoadedSlice { void addSliceGain(const LoadedSlice &LS) { // Each slice saves a truncate. const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo(); - if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(), - LS.Inst->getValueType(0))) + if (!TLI.isTruncateFree(LS.Inst->getOperand(0), LS.Inst->getValueType(0))) ++Truncates; // If there is a shift amount, this slice gets rid of it. if (LS.Shift) From 9423e459875b0dcdf24975976838d651a92f1bdb Mon Sep 17 00:00:00 2001 From: Benjamin Kramer <benny.kra@googlemail.com> Date: Sun, 24 Dec 2023 17:48:18 +0100 Subject: [PATCH 232/342] [ProfileData] Copy CallTargetMaps a bit less. NFCI --- llvm/include/llvm/ProfileData/SampleProf.h | 4 ++-- .../include/llvm/Transforms/IPO/ProfiledCallGraph.h | 5 ++--- llvm/lib/Target/X86/X86InsertPrefetch.cpp | 6 +++--- llvm/lib/Transforms/IPO/SampleProfile.cpp | 13 ++++++------- llvm/tools/llvm-profgen/CSPreInliner.cpp | 5 ++--- 5 files changed, 15 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index d995cc69af894..66aaf602d0e1d 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -883,7 +883,7 @@ class FunctionSamples { /// Returns the call target map collected at a given location. /// Each location is specified by \p LineOffset and \p Discriminator. /// If the location is not found in profile, return error. - ErrorOr<SampleRecord::CallTargetMap> + ErrorOr<const SampleRecord::CallTargetMap &> findCallTargetMapAt(uint32_t LineOffset, uint32_t Discriminator) const { const auto &ret = BodySamples.find( mapIRLocToProfileLoc(LineLocation(LineOffset, Discriminator))); @@ -894,7 +894,7 @@ class FunctionSamples { /// Returns the call target map collected at a given location specified by \p /// CallSite. If the location is not found in profile, return error. 
- ErrorOr<SampleRecord::CallTargetMap> + ErrorOr<const SampleRecord::CallTargetMap &> findCallTargetMapAt(const LineLocation &CallSite) const { const auto &Ret = BodySamples.find(mapIRLocToProfileLoc(CallSite)); if (Ret == BodySamples.end()) diff --git a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h index 9a0abdfa89544..8bf902fc8d284 100644 --- a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h +++ b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h @@ -114,9 +114,8 @@ class ProfiledCallGraph { uint64_t CallsiteCount = 0; LineLocation Callsite = Callee->getCallSiteLoc(); if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { - SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); - auto It = TargetCounts.find(CalleeSamples->getFunction()); - if (It != TargetCounts.end()) + auto It = CallTargets->find(CalleeSamples->getFunction()); + if (It != CallTargets->end()) CallsiteCount = It->second; } Weight = std::max(CallsiteCount, CalleeEntryCount); diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp index 3e11ab2d98a44..6c23928228d21 100644 --- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp +++ b/llvm/lib/Target/X86/X86InsertPrefetch.cpp @@ -69,8 +69,8 @@ using PrefetchHints = SampleRecord::CallTargetMap; // Return any prefetching hints for the specified MachineInstruction. The hints // are returned as pairs (name, delta). -ErrorOr<PrefetchHints> getPrefetchHints(const FunctionSamples *TopSamples, - const MachineInstr &MI) { +ErrorOr<const PrefetchHints &> +getPrefetchHints(const FunctionSamples *TopSamples, const MachineInstr &MI) { if (const auto &Loc = MI.getDebugLoc()) if (const auto *Samples = TopSamples->findFunctionSamples(Loc)) return Samples->findCallTargetMapAt(FunctionSamples::getOffset(Loc), @@ -123,7 +123,7 @@ bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples, }; static const char *SerializedPrefetchPrefix = "__prefetch"; - const ErrorOr<PrefetchHints> T = getPrefetchHints(TopSamples, MI); + auto T = getPrefetchHints(TopSamples, MI); if (!T) return false; int16_t max_index = -1; diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 6c6f0a0eca72a..2fd8668d15e20 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -794,10 +794,9 @@ SampleProfileLoader::findIndirectCallFunctionSamples( return R; auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL); - auto T = FS->findCallTargetMapAt(CallSite); Sum = 0; - if (T) - for (const auto &T_C : T.get()) + if (auto T = FS->findCallTargetMapAt(CallSite)) + for (const auto &T_C : *T) Sum += T_C.second; if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) { if (M->empty()) @@ -1679,7 +1678,8 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) { if (!FS) continue; auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL); - auto T = FS->findCallTargetMapAt(CallSite); + ErrorOr<SampleRecord::CallTargetMap> T = + FS->findCallTargetMapAt(CallSite); if (!T || T.get().empty()) continue; if (FunctionSamples::ProfileIsProbeBased) { @@ -2261,9 +2261,8 @@ void SampleProfileMatcher::countProfileCallsiteMismatches( // Compute number of samples in the original profile. 
uint64_t CallsiteSamples = 0; - auto CTM = FS.findCallTargetMapAt(Loc); - if (CTM) { - for (const auto &I : CTM.get()) + if (auto CTM = FS.findCallTargetMapAt(Loc)) { + for (const auto &I : *CTM) CallsiteSamples += I.second; } const auto *FSMap = FS.findFunctionSamplesMapAt(Loc); diff --git a/llvm/tools/llvm-profgen/CSPreInliner.cpp b/llvm/tools/llvm-profgen/CSPreInliner.cpp index 025d3ca5a6da5..87df6996aa435 100644 --- a/llvm/tools/llvm-profgen/CSPreInliner.cpp +++ b/llvm/tools/llvm-profgen/CSPreInliner.cpp @@ -128,9 +128,8 @@ bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, uint64_t CallsiteCount = 0; LineLocation Callsite = CalleeNode->getCallSiteLoc(); if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { - SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); - auto It = TargetCounts.find(CalleeSamples->getFunction()); - if (It != TargetCounts.end()) + auto It = CallTargets->find(CalleeSamples->getFunction()); + if (It != CallTargets->end()) CallsiteCount = It->second; } From d8ddcae547e782a4765783c62747381624f076d4 Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Sun, 24 Dec 2023 10:35:01 -0800 Subject: [PATCH 233/342] [LSR] Fix typo in debug message where backspace escape was used instead of new line. --- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 39607464dd009..a58bbe3185638 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -7006,7 +7006,7 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, LLVM_DEBUG(dbgs() << "Old term-cond:\n" << *OldTermCond << "\n" - << "New term-cond:\b" << *NewTermCond << "\n"); + << "New term-cond:\n" << *NewTermCond << "\n"); BI->setCondition(NewTermCond); From 4c1bc8e753c24df27ee8ccfc4041dd55e7c3a6ee Mon Sep 17 00:00:00 2001 From: Lang Hames <lhames@gmail.com> Date: Sat, 23 Dec 2023 18:33:30 -0800 Subject: [PATCH 234/342] [JITLink][MachO] Handle intra-block subtractor relocations. Previously the JITLink MachO backends (aarch64 and x86-64) only looked at the fixup block to determine which symbol was being fixed up. This assumption breaks if both symbols used in the subtractor are in the same block. The fix is to check for such cases and use the offsets of each symbol to decide which is being fixed up. The issue only resulted in incorrect behavior for negative-delta relocations, so the testcases use eh-frames with explicit edges for the CIE-pointer field in FDEs (since these are negative-deltas). 
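
A rough standalone sketch of the direction rule just described (not part of the patch): plain integer addresses stand in for the JITLink symbols, the helper name is made up for illustration, and the real check lives in the MachO arm64/x86-64 builders in the diff below.

#include <cassert>
#include <cstdint>

// Decide whether a SUBTRACTOR fixup is anchored on the "from" symbol (A)
// or the "to" symbol (B) when both symbols live in the same block,
// mirroring the offset comparison described above.
static bool fixingFromSymbol(uint64_t FromAddr, uint64_t ToAddr,
                             uint64_t FixupAddr) {
  if (ToAddr > FixupAddr)    // "to" sits past the fixup location,
    return true;             //   so the fixup belongs to "from".
  if (FromAddr > FixupAddr)  // "from" sits past the fixup location,
    return false;            //   so the fixup belongs to "to".
  return FromAddr >= ToAddr; // Neither does: fall back to comparing offsets.
}

int main() {
  // A fixup at offset 0x08 between symbols at 0x00 and 0x20; swapping the
  // roles of the two symbols flips which side is being fixed up.
  assert(fixingFromSymbol(/*FromAddr=*/0x00, /*ToAddr=*/0x20, /*FixupAddr=*/0x08));
  assert(!fixingFromSymbol(/*FromAddr=*/0x20, /*ToAddr=*/0x00, /*FixupAddr=*/0x08));
  return 0;
}
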
rdar://119351329 --- .../ExecutionEngine/JITLink/MachO_arm64.cpp | 32 +- .../ExecutionEngine/JITLink/MachO_x86_64.cpp | 32 +- .../MachO_subtractor_single_block.yaml | 306 ++++++++++++++++++ .../x86-64/MachO_subtractor_single_block.yaml | 159 +++++++++ 4 files changed, 517 insertions(+), 12 deletions(-) create mode 100644 llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml create mode 100644 llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp index 409bec7a874b5..809b2d51f0596 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp @@ -188,21 +188,41 @@ class MachOLinkGraphBuilder_arm64 : public MachOLinkGraphBuilder { Edge::Kind DeltaKind; Symbol *TargetSymbol; uint64_t Addend; + + bool FixingFromSymbol = true; if (&BlockToFix == &FromSymbol->getAddressable()) { + if (LLVM_UNLIKELY(&BlockToFix == &ToSymbol->getAddressable())) { + // From and To are symbols in the same block. Decide direction by offset + // instead. + if (ToSymbol->getAddress() > FixupAddress) + FixingFromSymbol = true; + else if (FromSymbol->getAddress() > FixupAddress) + FixingFromSymbol = false; + else + FixingFromSymbol = FromSymbol->getAddress() >= ToSymbol->getAddress(); + } else + FixingFromSymbol = true; + } else { + if (&BlockToFix == &ToSymbol->getAddressable()) + FixingFromSymbol = false; + else { + // BlockToFix was neither FromSymbol nor ToSymbol. + return make_error<JITLinkError>("SUBTRACTOR relocation must fix up " + "either 'A' or 'B' (or a symbol in one " + "of their alt-entry groups)"); + } + } + + if (FixingFromSymbol) { TargetSymbol = ToSymbol; DeltaKind = (SubRI.r_length == 3) ? aarch64::Delta64 : aarch64::Delta32; Addend = FixupValue + (FixupAddress - FromSymbol->getAddress()); // FIXME: handle extern 'from'. - } else if (&BlockToFix == &ToSymbol->getAddressable()) { + } else { TargetSymbol = &*FromSymbol; DeltaKind = (SubRI.r_length == 3) ? aarch64::NegDelta64 : aarch64::NegDelta32; Addend = FixupValue - (FixupAddress - ToSymbol->getAddress()); - } else { - // BlockToFix was neither FromSymbol nor ToSymbol. - return make_error<JITLinkError>("SUBTRACTOR relocation must fix up " - "either 'A' or 'B' (or a symbol in one " - "of their alt-entry groups)"); } return PairRelocInfo(DeltaKind, TargetSymbol, Addend); diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index 49f619357f089..eeca27771ad64 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -179,21 +179,41 @@ class MachOLinkGraphBuilder_x86_64 : public MachOLinkGraphBuilder { Edge::Kind DeltaKind; Symbol *TargetSymbol; uint64_t Addend; + + bool FixingFromSymbol = true; if (&BlockToFix == &FromSymbol->getAddressable()) { + if (LLVM_UNLIKELY(&BlockToFix == &ToSymbol->getAddressable())) { + // From and To are symbols in the same block. Decide direction by offset + // instead. + if (ToSymbol->getAddress() > FixupAddress) + FixingFromSymbol = true; + else if (FromSymbol->getAddress() > FixupAddress) + FixingFromSymbol = false; + else + FixingFromSymbol = FromSymbol->getAddress() >= ToSymbol->getAddress(); + } else + FixingFromSymbol = true; + } else { + if (&BlockToFix == &ToSymbol->getAddressable()) + FixingFromSymbol = false; + else { + // BlockToFix was neither FromSymbol nor ToSymbol. 
+ return make_error<JITLinkError>("SUBTRACTOR relocation must fix up " + "either 'A' or 'B' (or a symbol in one " + "of their alt-entry groups)"); + } + } + + if (FixingFromSymbol) { TargetSymbol = ToSymbol; DeltaKind = (SubRI.r_length == 3) ? x86_64::Delta64 : x86_64::Delta32; Addend = FixupValue + (FixupAddress - FromSymbol->getAddress()); // FIXME: handle extern 'from'. - } else if (&BlockToFix == &ToSymbol->getAddressable()) { + } else { TargetSymbol = FromSymbol; DeltaKind = (SubRI.r_length == 3) ? x86_64::NegDelta64 : x86_64::NegDelta32; Addend = FixupValue - (FixupAddress - ToSymbol->getAddress()); - } else { - // BlockToFix was neither FromSymbol nor ToSymbol. - return make_error<JITLinkError>("SUBTRACTOR relocation must fix up " - "either 'A' or 'B' (or a symbol in one " - "of their alt-entry chains)"); } return PairRelocInfo(DeltaKind, TargetSymbol, Addend); diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml new file mode 100644 index 0000000000000..dec9f274072cb --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml @@ -0,0 +1,306 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-jitlink -noexec %t +# +# Check that MachO::ARM64_RELOC_SUBTRACTOR relocations work when the fixup +# location and target are in the same block (in this case in the __eh_frame +# section). + +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x1 + ncmds: 5 + sizeofcmds: 480 + flags: 0x0 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 392 + segname: '' + vmaddr: 0 + vmsize: 200 + fileoff: 544 + filesize: 200 + maxprot: 7 + initprot: 7 + nsects: 4 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 72 + offset: 0x220 + align: 2 + reloff: 0x2E8 + nreloc: 6 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: F44FBEA9FD7B01A9800080520000009448058052080000B901000090210040F9020080D200000094200020D400000094130040B900000094E00313AAFD7B41A9F44FC2A8C0035FD6 + relocations: + - address: 0x34 + symbolnum: 7 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - address: 0x2C + symbolnum: 6 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - address: 0x24 + symbolnum: 8 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - address: 0x1C + symbolnum: 4 + pcrel: false + length: 2 + extern: true + type: 6 + scattered: false + value: 0 + - address: 0x18 + symbolnum: 4 + pcrel: true + length: 2 + extern: true + type: 5 + scattered: false + value: 0 + - address: 0xC + symbolnum: 5 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - sectname: __gcc_except_tab + segname: __TEXT + addr: 0x48 + size: 24 + offset: 0x268 + align: 2 + reloff: 0x318 + nreloc: 1 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: FF9B15010C0018000018102C012820000001000000000000 + relocations: + - address: 0x14 + symbolnum: 4 + pcrel: true + length: 2 + extern: true + type: 7 + scattered: false + value: 0 + - sectname: __eh_frame + segname: __TEXT + addr: 0x60 + size: 72 + offset: 0x280 + align: 3 + reloff: 0x320 + nreloc: 7 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 
1800000000000000017A504C520001781E079B0000000010100C1F002800000004000000F8FFFFFFFFFFFFFF480000000000000008E7FFFFFFFFFFFFFF480E209E019D0293039404 + relocations: + - address: 0x13 + symbolnum: 9 + pcrel: true + length: 2 + extern: true + type: 7 + scattered: false + value: 0 + - address: 0x20 + symbolnum: 1 + pcrel: false + length: 2 + extern: true + type: 1 + scattered: false + value: 0 + - address: 0x20 + symbolnum: 2 + pcrel: false + length: 2 + extern: true + type: 0 + scattered: false + value: 0 + - address: 0x24 + symbolnum: 2 + pcrel: false + length: 3 + extern: true + type: 1 + scattered: false + value: 0 + - address: 0x24 + symbolnum: 3 + pcrel: false + length: 3 + extern: true + type: 0 + scattered: false + value: 0 + - address: 0x35 + symbolnum: 2 + pcrel: false + length: 3 + extern: true + type: 1 + scattered: false + value: 0 + - address: 0x35 + symbolnum: 0 + pcrel: false + length: 3 + extern: true + type: 0 + scattered: false + value: 0 + - sectname: __compact_unwind + segname: __LD + addr: 0xA8 + size: 32 + offset: 0x2C8 + align: 3 + reloff: 0x358 + nreloc: 2 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '0000000000000000480000000000000300000000000000000000000000000000' + relocations: + - address: 0x0 + symbolnum: 3 + pcrel: false + length: 3 + extern: true + type: 0 + scattered: false + value: 0 + - address: 0x18 + symbolnum: 0 + pcrel: false + length: 3 + extern: true + type: 0 + scattered: false + value: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 880 + nsyms: 10 + stroff: 1040 + strsize: 152 + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 917504 + sdk: 0 + ntools: 1 + Tools: + - tool: 3 + version: 59048448 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 872 + datasize: 0 + - cmd: LC_LINKER_OPTIMIZATION_HINT + cmdsize: 16 + dataoff: 872 + datasize: 8 +LinkEditData: + NameList: + - n_strx: 112 + n_type: 0xE + n_sect: 2 + n_desc: 32 + n_value: 72 + - n_strx: 130 + n_type: 0xE + n_sect: 3 + n_desc: 0 + n_value: 96 + - n_strx: 140 + n_type: 0xE + n_sect: 3 + n_desc: 0 + n_value: 124 + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 32 + n_value: 0 + - n_strx: 8 + n_type: 0x1 + n_sect: 0 + n_desc: 0 + n_value: 0 + - n_strx: 15 + n_type: 0x1 + n_sect: 0 + n_desc: 0 + n_value: 0 + - n_strx: 41 + n_type: 0x1 + n_sect: 0 + n_desc: 0 + n_value: 0 + - n_strx: 60 + n_type: 0x1 + n_sect: 0 + n_desc: 0 + n_value: 0 + - n_strx: 77 + n_type: 0x1 + n_sect: 0 + n_desc: 0 + n_value: 0 + - n_strx: 90 + n_type: 0x1 + n_sect: 0 + n_desc: 0 + n_value: 0 + StringTable: + - ' ' + - _main + - __ZTIi + - ___cxa_allocate_exception + - ___cxa_begin_catch + - ___cxa_end_catch + - ___cxa_throw + - ___gxx_personality_v0 + - GCC_except_table0 + - EH_Frame1 + - func.eh + - '' + - '' + - '' + - '' +... diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml new file mode 100644 index 0000000000000..704c611cf9f79 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml @@ -0,0 +1,159 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-jitlink -noexec %t +# +# Check that MachO::X86_64_RELOC_SUBTRACTOR relocations work when the fixup +# location and target are in the same block (in this case in the __eh_frame +# section). 
+ +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x1000007 + cpusubtype: 0x3 + filetype: 0x1 + ncmds: 4 + sizeofcmds: 384 + flags: 0x0 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 312 + segname: '' + vmaddr: 0 + vmsize: 96 + fileoff: 448 + filesize: 96 + maxprot: 7 + initprot: 7 + nsects: 3 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 3 + offset: 0x1C0 + align: 4 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 31C0C3 + - sectname: __eh_frame + segname: __TEXT + addr: 0x8 + size: 56 + offset: 0x1C8 + align: 3 + reloff: 0x220 + nreloc: 4 + flags: 0x0 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 1400000000000000017A520001781001100C0708900100001C00000004000000F8FFFFFFFFFFFFFF03000000000000000000000000000000 + relocations: + - address: 0x1C + symbolnum: 0 + pcrel: false + length: 2 + extern: true + type: 5 + scattered: false + value: 0 + - address: 0x1C + symbolnum: 1 + pcrel: false + length: 2 + extern: true + type: 0 + scattered: false + value: 0 + - address: 0x20 + symbolnum: 1 + pcrel: false + length: 3 + extern: true + type: 5 + scattered: false + value: 0 + - address: 0x20 + symbolnum: 2 + pcrel: false + length: 3 + extern: true + type: 0 + scattered: false + value: 0 + - sectname: __compact_unwind + segname: __LD + addr: 0x40 + size: 32 + offset: 0x200 + align: 3 + reloff: 0x240 + nreloc: 1 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '0000000000000000030000000000000400000000000000000000000000000000' + relocations: + - address: 0x0 + symbolnum: 2 + pcrel: false + length: 3 + extern: true + type: 0 + scattered: false + value: 0 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 584 + nsyms: 3 + stroff: 632 + strsize: 32 + - cmd: LC_BUILD_VERSION + cmdsize: 32 + platform: 1 + minos: 917504 + sdk: 0 + ntools: 1 + Tools: + - tool: 3 + version: 59048448 + - cmd: LC_DATA_IN_CODE + cmdsize: 16 + dataoff: 584 + datasize: 0 +LinkEditData: + NameList: + - n_strx: 8 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 8 + - n_strx: 18 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 32 + - n_strx: 2 + n_type: 0xF + n_sect: 1 + n_desc: 32 + n_value: 0 + StringTable: + - ' ' + - _main + - EH_Frame1 + - func.eh + - '' + - '' + - '' + - '' + - '' + - '' +... From 09e6f12cba1e6250ef223c6afba6fcf18b0cacae Mon Sep 17 00:00:00 2001 From: Petr Hosek <phosek@google.com> Date: Sun, 24 Dec 2023 15:10:40 -0800 Subject: [PATCH 235/342] [builtins] Fix CPU feature detection for Zircon (#76276) This is a follow up to #75635 which broke the build on Fuchsia. We don't support ifunc on Fuchsia so we shouldn't define __init_cpu_features. For __init_cpu_features_resolver we have to use _zx_system_get_features as a Zircon native solution. 
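
A minimal host-buildable sketch of the pattern the new fuchsia.inc follows: query one feature word from the kernel, then translate individual bits into the runtime's feature mask. The bit values and feature indices below are made-up placeholders (the real ones come from <zircon/features.h> and compiler-rt's cpu_model sources), and the vDSO call is stubbed out so the snippet runs anywhere.

#include <cassert>
#include <cstdint>

// Placeholder bits standing in for ZX_ARM64_FEATURE_ISA_* values and for
// the FMV feature indices; both are assumptions for illustration only.
constexpr uint32_t kZxFeatFP  = 1u << 1;
constexpr uint32_t kZxFeatAES = 1u << 3;
constexpr unsigned kFeatFP  = 0;
constexpr unsigned kFeatAES = 1;

// Stand-in for _zx_system_get_features(ZX_FEATURE_KIND_CPU, &word); on
// Fuchsia that vDSO call fills in the word, here we return a canned value.
static int getCpuFeatureWord(uint32_t *Word) {
  *Word = kZxFeatFP | kZxFeatAES;
  return 0; // 0 plays the role of ZX_OK.
}

// Mirrors the shape of the mapping in fuchsia.inc: bail out (leaving the
// mask empty) if the query fails, otherwise test one bit per feature.
static uint64_t buildFeatureMask() {
  uint32_t Word = 0;
  if (getCpuFeatureWord(&Word) != 0)
    return 0;
  uint64_t Mask = 0;
  if (Word & kZxFeatFP)
    Mask |= 1ULL << kFeatFP;
  if (Word & kZxFeatAES)
    Mask |= 1ULL << kFeatAES;
  return Mask;
}

int main() {
  assert(buildFeatureMask() == 0b11);
  return 0;
}
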
--- compiler-rt/lib/builtins/cpu_model/aarch64.c | 1 - .../cpu_model/aarch64/fmv/fuchsia.inc | 59 ++++++++++++++----- 2 files changed, 44 insertions(+), 16 deletions(-) diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c index d59dbfdad3808..8e85de2218f7d 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64.c +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c @@ -133,7 +133,6 @@ struct { #include "aarch64/fmv/mrs.inc" #include "aarch64/fmv/freebsd.inc" #elif defined(__Fuchsia__) -#include "aarch64/fmv/mrs.inc" #include "aarch64/fmv/fuchsia.inc" #elif defined(__ANDROID__) #include "aarch64/fmv/mrs.inc" diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc index 4dab6ff58b378..d8e0280f40416 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc @@ -1,22 +1,51 @@ -void __init_cpu_features_resolver(unsigned long hwcap, - const __ifunc_arg_t *arg) { +#include <zircon/features.h> +#include <zircon/syscalls.h> + +void __init_cpu_features_resolver() { if (__aarch64_cpu_features.features) return; - __init_cpu_features_constructor(hwcap, arg); -} - -void CONSTRUCTOR_ATTRIBUTE __init_cpu_features(void) { - // CPU features already initialized. - if (__aarch64_cpu_features.features) + // This ensures the vDSO is a direct link-time dependency of anything that + // needs this initializer code. +#pragma comment(lib, "zircon") + uint32_t features; + zx_status_t status = _zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); + if (status != ZX_OK) return; - unsigned long hwcap = getauxval(AT_HWCAP); - unsigned long hwcap2 = getauxval(AT_HWCAP2); +#define setCPUFeature(cpu_feature) \ + __aarch64_cpu_features.features |= 1ULL << cpu_feature + + if (features & ZX_ARM64_FEATURE_ISA_FP) + setCPUFeature(FEAT_FP); + if (features & ZX_ARM64_FEATURE_ISA_ASIMD) + setCPUFeature(FEAT_SIMD); + if (features & ZX_ARM64_FEATURE_ISA_AES) + setCPUFeature(FEAT_AES); + if (features & ZX_ARM64_FEATURE_ISA_PMULL) + setCPUFeature(FEAT_PMULL); + if (features & ZX_ARM64_FEATURE_ISA_SHA1) + setCPUFeature(FEAT_SHA1); + if (features & ZX_ARM64_FEATURE_ISA_SHA256) + setCPUFeature(FEAT_SHA2); + if (features & ZX_ARM64_FEATURE_ISA_CRC32) + setCPUFeature(FEAT_CRC); + if (features & ZX_ARM64_FEATURE_ISA_RDM) + setCPUFeature(FEAT_RDM); + if (features & ZX_ARM64_FEATURE_ISA_SHA3) + setCPUFeature(FEAT_SHA3); + if (features & ZX_ARM64_FEATURE_ISA_SM4) + setCPUFeature(FEAT_SM4); + if (features & ZX_ARM64_FEATURE_ISA_DP) + setCPUFeature(FEAT_DOTPROD); + if (features & ZX_ARM64_FEATURE_ISA_FHM) + setCPUFeature(FEAT_FP16FML); + if (features & ZX_ARM64_FEATURE_ISA_SHA512) + setCPUFeature(FEAT_SHA3); + if (features & ZX_ARM64_FEATURE_ISA_I8MM) + setCPUFeature(FEAT_I8MM); + if (features & ZX_ARM64_FEATURE_ISA_SVE) + setCPUFeature(FEAT_SVE); - __ifunc_arg_t arg; - arg._size = sizeof(__ifunc_arg_t); - arg._hwcap = hwcap; - arg._hwcap2 = hwcap2; - __init_cpu_features_constructor(hwcap | _IFUNC_ARG_HWCAP, &arg); + setCPUFeature(FEAT_INIT); } From 422b67aaab1a49cc42f2186069a1e7c548d57858 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Sun, 24 Dec 2023 15:13:15 -0800 Subject: [PATCH 236/342] [Analysis] Use range-based for loops (NFC) --- llvm/lib/Analysis/MemoryDependenceAnalysis.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp 
b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 49eccde45f317..951e00e341422 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -1292,16 +1292,16 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( if (InsertRes.first->second != Pointer.getAddr()) { // Make sure to clean up the Visited map before continuing on to // PredTranslationFailure. - for (unsigned i = 0; i < NewBlocks.size(); i++) - Visited.erase(NewBlocks[i]); + for (auto *NewBlock : NewBlocks) + Visited.erase(NewBlock); goto PredTranslationFailure; } } if (NewBlocks.size() > WorklistEntries) { // Make sure to clean up the Visited map before continuing on to // PredTranslationFailure. - for (unsigned i = 0; i < NewBlocks.size(); i++) - Visited.erase(NewBlocks[i]); + for (auto *NewBlock : NewBlocks) + Visited.erase(NewBlock); GotWorklistLimit = true; goto PredTranslationFailure; } @@ -1359,8 +1359,8 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( // Make sure to clean up the Visited map before continuing on to // PredTranslationFailure. - for (unsigned i = 0, n = PredList.size(); i < n; ++i) - Visited.erase(PredList[i].first); + for (const auto &Pred : PredList) + Visited.erase(Pred.first); goto PredTranslationFailure; } @@ -1371,9 +1371,9 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( // any results for. (getNonLocalPointerDepFromBB will modify our // datastructures in ways the code after the PredTranslationFailure label // doesn't expect.) - for (unsigned i = 0, n = PredList.size(); i < n; ++i) { - BasicBlock *Pred = PredList[i].first; - PHITransAddr &PredPointer = PredList[i].second; + for (auto &I : PredList) { + BasicBlock *Pred = I.first; + PHITransAddr &PredPointer = I.second; Value *PredPtrVal = PredPointer.getAddr(); bool CanTranslate = true; From 0d454d6e591a579f450093c4ba8c49675e1643ad Mon Sep 17 00:00:00 2001 From: Yingwei Zheng <dtcxzyw2333@gmail.com> Date: Mon, 25 Dec 2023 07:14:31 +0800 Subject: [PATCH 237/342] [InstCombine] Fold xor of icmps using range information (#76334) This patch folds xor of icmps into a single comparison using range-based reasoning as `foldAndOrOfICmpsUsingRanges` does. Fixes #70928. 
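
A small worked example (not part of the patch) of the range reasoning on the `(X s> 4) ^ (X s< 6)` case that the new tests fold to `X != 5`; it uses the same ConstantRange calls the patch relies on and needs LLVM's headers and libraries to build.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
#include <cassert>

using namespace llvm;

int main() {
  // Regions of i32 X satisfying each icmp.
  ConstantRange GT4 =
      ConstantRange::makeExactICmpRegion(CmpInst::ICMP_SGT, APInt(32, 4));
  ConstantRange LT6 =
      ConstantRange::makeExactICmpRegion(CmpInst::ICMP_SLT, APInt(32, 6));

  // xor is the symmetric difference: (A u B) \ (A n B).
  auto Union = GT4.exactUnionWith(LT6);     // here: the full i32 range
  auto Inter = GT4.exactIntersectWith(LT6); // here: just {5}
  assert(Union && Inter);
  auto Xor = Union->exactIntersectWith(Inter->inverse());
  assert(Xor);

  // Turn the resulting range back into a single comparison: X != 5.
  CmpInst::Predicate Pred;
  APInt RHS, Offset;
  Xor->getEquivalentICmp(Pred, RHS, Offset);
  assert(Pred == CmpInst::ICMP_NE && RHS == 5 && Offset == 0);
  return 0;
}
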
--- .../InstCombine/InstCombineAndOrXor.cpp | 49 ++++-- .../Transforms/InstCombine/and-or-icmps.ll | 6 +- llvm/test/Transforms/InstCombine/xor-icmps.ll | 148 ++++++++++++++++++ 3 files changed, 182 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 5e362f4117d05..63b1e0f64a882 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3956,35 +3956,50 @@ Value *InstCombinerImpl::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, const APInt *LC, *RC; if (match(LHS1, m_APInt(LC)) && match(RHS1, m_APInt(RC)) && LHS0->getType() == RHS0->getType() && - LHS0->getType()->isIntOrIntVectorTy() && - (LHS->hasOneUse() || RHS->hasOneUse())) { + LHS0->getType()->isIntOrIntVectorTy()) { // Convert xor of signbit tests to signbit test of xor'd values: // (X > -1) ^ (Y > -1) --> (X ^ Y) < 0 // (X < 0) ^ (Y < 0) --> (X ^ Y) < 0 // (X > -1) ^ (Y < 0) --> (X ^ Y) > -1 // (X < 0) ^ (Y > -1) --> (X ^ Y) > -1 bool TrueIfSignedL, TrueIfSignedR; - if (isSignBitCheck(PredL, *LC, TrueIfSignedL) && + if ((LHS->hasOneUse() || RHS->hasOneUse()) && + isSignBitCheck(PredL, *LC, TrueIfSignedL) && isSignBitCheck(PredR, *RC, TrueIfSignedR)) { Value *XorLR = Builder.CreateXor(LHS0, RHS0); return TrueIfSignedL == TrueIfSignedR ? Builder.CreateIsNeg(XorLR) : Builder.CreateIsNotNeg(XorLR); } - // (X > C) ^ (X < C + 2) --> X != C + 1 - // (X < C + 2) ^ (X > C) --> X != C + 1 - // Considering the correctness of this pattern, we should avoid that C is - // non-negative and C + 2 is negative, although it will be matched by other - // patterns. - const APInt *C1, *C2; - if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_APInt(C1)) && - PredR == CmpInst::ICMP_SLT && match(RHS1, m_APInt(C2))) || - (PredL == CmpInst::ICMP_SLT && match(LHS1, m_APInt(C2)) && - PredR == CmpInst::ICMP_SGT && match(RHS1, m_APInt(C1)))) - if (LHS0 == RHS0 && *C1 + 2 == *C2 && - (C1->isNegative() || C2->isNonNegative())) - return Builder.CreateICmpNE(LHS0, - ConstantInt::get(LHS0->getType(), *C1 + 1)); + // Fold (icmp pred1 X, C1) ^ (icmp pred2 X, C2) + // into a single comparison using range-based reasoning. 
+ if (LHS0 == RHS0) { + ConstantRange CR1 = ConstantRange::makeExactICmpRegion(PredL, *LC); + ConstantRange CR2 = ConstantRange::makeExactICmpRegion(PredR, *RC); + auto CRUnion = CR1.exactUnionWith(CR2); + auto CRIntersect = CR1.exactIntersectWith(CR2); + if (CRUnion && CRIntersect) + if (auto CR = CRUnion->exactIntersectWith(CRIntersect->inverse())) { + if (CR->isFullSet()) + return ConstantInt::getTrue(I.getType()); + if (CR->isEmptySet()) + return ConstantInt::getFalse(I.getType()); + + CmpInst::Predicate NewPred; + APInt NewC, Offset; + CR->getEquivalentICmp(NewPred, NewC, Offset); + + if ((Offset.isZero() && (LHS->hasOneUse() || RHS->hasOneUse())) || + (LHS->hasOneUse() && RHS->hasOneUse())) { + Value *NewV = LHS0; + Type *Ty = LHS0->getType(); + if (!Offset.isZero()) + NewV = Builder.CreateAdd(NewV, ConstantInt::get(Ty, Offset)); + return Builder.CreateICmp(NewPred, NewV, + ConstantInt::get(Ty, NewC)); + } + } + } } // Instead of trying to imitate the folds for and/or, decompose this 'xor' diff --git a/llvm/test/Transforms/InstCombine/and-or-icmps.ll b/llvm/test/Transforms/InstCombine/and-or-icmps.ll index 881a9b7ff129d..91ecf24760259 100644 --- a/llvm/test/Transforms/InstCombine/and-or-icmps.ll +++ b/llvm/test/Transforms/InstCombine/and-or-icmps.ll @@ -3015,10 +3015,8 @@ define i32 @icmp_x_slt_0_and_icmp_y_sgt_neg1_i32_fail(i32 %x, i32 %y) { define i32 @icmp_slt_0_xor_icmp_sge_neg2_i32_fail(i32 %x) { ; CHECK-LABEL: @icmp_slt_0_xor_icmp_sge_neg2_i32_fail( -; CHECK-NEXT: [[A:%.*]] = icmp sgt i32 [[X:%.*]], -3 -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[X]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[TMP1]], [[A]] -; CHECK-NEXT: [[D:%.*]] = zext i1 [[TMP2]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[X:%.*]], -2 +; CHECK-NEXT: [[D:%.*]] = zext i1 [[TMP1]] to i32 ; CHECK-NEXT: ret i32 [[D]] ; %A = icmp sge i32 %x, -2 diff --git a/llvm/test/Transforms/InstCombine/xor-icmps.ll b/llvm/test/Transforms/InstCombine/xor-icmps.ll index c85993ea9a7e0..f104cd7fdcada 100644 --- a/llvm/test/Transforms/InstCombine/xor-icmps.ll +++ b/llvm/test/Transforms/InstCombine/xor-icmps.ll @@ -171,3 +171,151 @@ define i1 @xor_icmp_ptr(ptr %c, ptr %d) { ret i1 %xor } +; Tests from PR70928 +define i1 @xor_icmp_true_signed(i32 %a) { +; CHECK-LABEL: @xor_icmp_true_signed( +; CHECK-NEXT: ret i1 true +; + %cmp = icmp sgt i32 %a, 5 + %cmp1 = icmp slt i32 %a, 6 + %cmp3 = xor i1 %cmp, %cmp1 + ret i1 %cmp3 +} +define i1 @xor_icmp_true_signed_multiuse1(i32 %a) { +; CHECK-LABEL: @xor_icmp_true_signed_multiuse1( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: ret i1 true +; + %cmp = icmp sgt i32 %a, 5 + call void @use(i1 %cmp) + %cmp1 = icmp slt i32 %a, 6 + %cmp3 = xor i1 %cmp, %cmp1 + ret i1 %cmp3 +} +define i1 @xor_icmp_true_signed_multiuse2(i32 %a) { +; CHECK-LABEL: @xor_icmp_true_signed_multiuse2( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 5 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A]], 6 +; CHECK-NEXT: call void @use(i1 [[CMP1]]) +; CHECK-NEXT: ret i1 true +; + %cmp = icmp sgt i32 %a, 5 + call void @use(i1 %cmp) + %cmp1 = icmp slt i32 %a, 6 + call void @use(i1 %cmp1) + %cmp3 = xor i1 %cmp, %cmp1 + ret i1 %cmp3 +} +define i1 @xor_icmp_true_signed_commuted(i32 %a) { +; CHECK-LABEL: @xor_icmp_true_signed_commuted( +; CHECK-NEXT: ret i1 true +; + %cmp = icmp sgt i32 %a, 5 + %cmp1 = icmp slt i32 %a, 6 + %cmp3 = xor i1 %cmp1, %cmp + ret i1 %cmp3 +} +define i1 @xor_icmp_true_unsigned(i32 %a) { +; 
CHECK-LABEL: @xor_icmp_true_unsigned( +; CHECK-NEXT: ret i1 true +; + %cmp = icmp ugt i32 %a, 5 + %cmp1 = icmp ult i32 %a, 6 + %cmp3 = xor i1 %cmp, %cmp1 + ret i1 %cmp3 +} +define i1 @xor_icmp_to_ne(i32 %a) { +; CHECK-LABEL: @xor_icmp_to_ne( +; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i32 [[A:%.*]], 5 +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp = icmp sgt i32 %a, 4 + %cmp1 = icmp slt i32 %a, 6 + %cmp3 = xor i1 %cmp, %cmp1 + ret i1 %cmp3 +} +define i1 @xor_icmp_to_ne_multiuse1(i32 %a) { +; CHECK-LABEL: @xor_icmp_to_ne_multiuse1( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 4 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i32 [[A]], 5 +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp = icmp sgt i32 %a, 4 + call void @use(i1 %cmp) + %cmp1 = icmp slt i32 %a, 6 + %cmp3 = xor i1 %cmp, %cmp1 + ret i1 %cmp3 +} +define i1 @xor_icmp_to_icmp_add(i32 %a) { +; CHECK-LABEL: @xor_icmp_to_icmp_add( +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[A:%.*]], -6 +; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i32 [[TMP1]], -2 +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp = icmp sgt i32 %a, 3 + %cmp1 = icmp slt i32 %a, 6 + %cmp3 = xor i1 %cmp, %cmp1 + ret i1 %cmp3 +} +; Negative tests +; The result of ConstantRange::difference is not exact. +define i1 @xor_icmp_invalid_range(i8 %x0) { +; CHECK-LABEL: @xor_icmp_invalid_range( +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X0:%.*]], -5 +; CHECK-NEXT: [[OR_COND:%.*]] = icmp ne i8 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[OR_COND]] +; + %cmp = icmp eq i8 %x0, 0 + %cmp4 = icmp ne i8 %x0, 4 + %or.cond = xor i1 %cmp, %cmp4 + ret i1 %or.cond +} +define i1 @xor_icmp_to_ne_multiuse2(i32 %a) { +; CHECK-LABEL: @xor_icmp_to_ne_multiuse2( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 4 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A]], 6 +; CHECK-NEXT: call void @use(i1 [[CMP1]]) +; CHECK-NEXT: [[CMP3:%.*]] = xor i1 [[CMP]], [[CMP1]] +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp = icmp sgt i32 %a, 4 + call void @use(i1 %cmp) + %cmp1 = icmp slt i32 %a, 6 + call void @use(i1 %cmp1) + %cmp3 = xor i1 %cmp, %cmp1 + ret i1 %cmp3 +} +define i1 @xor_icmp_to_icmp_add_multiuse1(i32 %a) { +; CHECK-LABEL: @xor_icmp_to_icmp_add_multiuse1( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 3 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A]], 6 +; CHECK-NEXT: [[CMP3:%.*]] = xor i1 [[CMP]], [[CMP1]] +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp = icmp sgt i32 %a, 3 + call void @use(i1 %cmp) + %cmp1 = icmp slt i32 %a, 6 + %cmp3 = xor i1 %cmp, %cmp1 + ret i1 %cmp3 +} +define i1 @xor_icmp_to_icmp_add_multiuse2(i32 %a) { +; CHECK-LABEL: @xor_icmp_to_icmp_add_multiuse2( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[A:%.*]], 3 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[A]], 6 +; CHECK-NEXT: call void @use(i1 [[CMP1]]) +; CHECK-NEXT: [[CMP3:%.*]] = xor i1 [[CMP]], [[CMP1]] +; CHECK-NEXT: ret i1 [[CMP3]] +; + %cmp = icmp sgt i32 %a, 3 + call void @use(i1 %cmp) + %cmp1 = icmp slt i32 %a, 6 + call void @use(i1 %cmp1) + %cmp3 = xor i1 %cmp, %cmp1 + ret i1 %cmp3 +} From a041da31093303b02b6da5fe919a0a3c234eb466 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Sun, 24 Dec 2023 15:56:36 -0800 Subject: [PATCH 238/342] [X86] Use range-based for loops (NFC) --- llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 7 +++---- llvm/lib/Target/X86/X86FastISel.cpp | 7 +++---- llvm/lib/Target/X86/X86FloatingPoint.cpp | 3 +-- llvm/lib/Target/X86/X86InstrInfo.cpp | 8 ++++---- 4 files changed, 11 
insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 1d40ce35c1b41..bc5f562d95893 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -263,8 +263,7 @@ class X86AsmParser : public MCTargetAsmParser { return 0; SmallVector<ICToken, 16> OperandStack; - for (unsigned i = 0, e = PostfixStack.size(); i != e; ++i) { - ICToken Op = PostfixStack[i]; + for (const ICToken &Op : PostfixStack) { if (Op.first == IC_IMM || Op.first == IC_REGISTER) { OperandStack.push_back(Op); } else if (isUnaryOperator(Op.first)) { @@ -1731,8 +1730,8 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands, OrigOperands.pop_back(); } // OrigOperands.append(FinalOperands.begin(), FinalOperands.end()); - for (unsigned int i = 0; i < FinalOperands.size(); ++i) - OrigOperands.push_back(std::move(FinalOperands[i])); + for (auto &Op : FinalOperands) + OrigOperands.push_back(std::move(Op)); return false; } diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 7f134fe1c72bd..0ba31e173a1a7 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -1306,8 +1306,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Subtarget->is64Bit() ? X86::RET64 : X86::RET32)); } - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); + for (unsigned Reg : RetRegs) + MIB.addReg(Reg, RegState::Implicit); return true; } @@ -3346,8 +3346,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Walk the register/memloc assignments, inserting copies/loads. const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign const &VA = ArgLocs[i]; + for (const CCValAssign &VA : ArgLocs) { const Value *ArgVal = OutVals[VA.getValNo()]; MVT ArgVT = OutVTs[VA.getValNo()]; diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp index aab2535aa86d9..ca4d03913d093 100644 --- a/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -462,8 +462,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { // Check to see if any of the values defined by this instruction are dead // after definition. If so, pop them. - for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) { - unsigned Reg = DeadRegs[i]; + for (unsigned Reg : DeadRegs) { // Check if Reg is live on the stack. An inline-asm register operand that // is in the clobber list and marked dead might not be live on the stack. 
static_assert(X86::FP7 - X86::FP0 == 7, "sequential FP regnumbers"); diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index bc2d5ed1e17dd..bddda6891356e 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -8263,8 +8263,8 @@ bool X86InstrInfo::unfoldMemoryOperand( DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), Reg); - for (unsigned i = 0, e = AddrOps.size(); i != e; ++i) - MIB.add(AddrOps[i]); + for (const MachineOperand &AddrOp : AddrOps) + MIB.add(AddrOp); MIB.setMemRefs(MMOs); NewMIs.push_back(MIB); @@ -8341,8 +8341,8 @@ bool X86InstrInfo::unfoldMemoryOperand( unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); - for (unsigned i = 0, e = AddrOps.size(); i != e; ++i) - MIB.add(AddrOps[i]); + for (const MachineOperand &AddrOp : AddrOps) + MIB.add(AddrOp); MIB.addReg(Reg, RegState::Kill); MIB.setMemRefs(MMOs); NewMIs.push_back(MIB); From fd331ef9971e0c5011dba7e0d9286c8d593108c0 Mon Sep 17 00:00:00 2001 From: HaohaiWen <haohai.wen@intel.com> Date: Mon, 25 Dec 2023 08:38:25 +0800 Subject: [PATCH 239/342] [CostModel][X86] Track fpext conversion cost for 16 elements (#76277) --- llvm/test/Analysis/CostModel/X86/cast.ll | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/llvm/test/Analysis/CostModel/X86/cast.ll b/llvm/test/Analysis/CostModel/X86/cast.ll index 5a83d4e81fd38..e0173e9df4dc3 100644 --- a/llvm/test/Analysis/CostModel/X86/cast.ll +++ b/llvm/test/Analysis/CostModel/X86/cast.ll @@ -616,27 +616,31 @@ define void @fp_conv(<8 x float> %a, <16 x float>%b, <4 x float> %c) { ; SSE-LABEL: 'fp_conv' ; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A1 = fpext <4 x float> %c to <4 x double> ; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A2 = fpext <8 x float> %a to <8 x double> -; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float> -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %A3 = fpext <16 x float> %b to <16 x double> +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A4 = fptrunc <4 x double> undef to <4 x float> +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A5 = fptrunc <8 x double> undef to <8 x float> ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'fp_conv' ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A1 = fpext <4 x float> %c to <4 x double> ; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A2 = fpext <8 x float> %a to <8 x double> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float> -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A3 = fpext <16 x float> %b to <16 x double> +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A4 = fptrunc <4 x double> undef to <4 x float> +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %A5 = fptrunc <8 x double> undef to <8 x float> ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret 
void ; ; AVX512-LABEL: 'fp_conv' ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A1 = fpext <4 x float> %c to <4 x double> ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A2 = fpext <8 x float> %a to <8 x double> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A3 = fptrunc <4 x double> undef to <4 x float> -; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A4 = fptrunc <8 x double> undef to <8 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %A3 = fpext <16 x float> %b to <16 x double> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A4 = fptrunc <4 x double> undef to <4 x float> +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %A5 = fptrunc <8 x double> undef to <8 x float> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %A1 = fpext <4 x float> %c to <4 x double> %A2 = fpext <8 x float> %a to <8 x double> - %A3 = fptrunc <4 x double> undef to <4 x float> - %A4 = fptrunc <8 x double> undef to <8 x float> + %A3 = fpext <16 x float> %b to <16 x double> + %A4 = fptrunc <4 x double> undef to <4 x float> + %A5 = fptrunc <8 x double> undef to <8 x float> ret void } From 536b043219e75976888bd77f6063b02ebb6ffdb9 Mon Sep 17 00:00:00 2001 From: HaohaiWen <haohai.wen@intel.com> Date: Mon, 25 Dec 2023 09:42:31 +0800 Subject: [PATCH 240/342] [RegAllocFast] Lazily initialize InstrPosIndexes for each MBB (#76275) Most basic blocks do not need to query dominance. Defer initialization of InstrPosIndexes to the first query for each MBB. --- llvm/lib/CodeGen/RegAllocFast.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index d7edaa1d7ea47..e81d479301368 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -66,6 +66,8 @@ namespace { /// can be used to determine dominance between instructions in same MBB. class InstrPosIndexes { public: + void unsetInitialized() { IsInitialized = false; } + void init(const MachineBasicBlock &MBB) { CurMBB = &MBB; Instr2PosIndex.clear(); @@ -80,6 +82,13 @@ class InstrPosIndexes { /// index without affecting existing instruction's index. Return true if all /// instructions index has been reassigned. bool getIndex(const MachineInstr &MI, uint64_t &Index) { + if (!IsInitialized) { + init(*MI.getParent()); + IsInitialized = true; + Index = Instr2PosIndex.at(&MI); + return true; + } + assert(MI.getParent() == CurMBB && "MI is not in CurMBB"); auto It = Instr2PosIndex.find(&MI); if (It != Instr2PosIndex.end()) { @@ -159,6 +168,7 @@ class InstrPosIndexes { } private: + bool IsInitialized = false; enum { InstrDist = 1024 }; const MachineBasicBlock *CurMBB = nullptr; DenseMap<const MachineInstr *, uint64_t> Instr2PosIndex; @@ -1665,7 +1675,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) { this->MBB = &MBB; LLVM_DEBUG(dbgs() << "\nAllocating " << MBB); - PosIndexes.init(MBB); + PosIndexes.unsetInitialized(); RegUnitStates.assign(TRI->getNumRegUnits(), regFree); assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?"); From 34727b01eb49181e4e5592df5dca984592ab4123 Mon Sep 17 00:00:00 2001 From: Jim Lin <jim@andestech.com> Date: Sun, 24 Dec 2023 20:04:55 -0600 Subject: [PATCH 241/342] [RISCV] Remove +experimental-zfbfmin from the testcases for Zvfbfmin intrinsics. NFC.
(#76317) Zvfbfmin doesn't need Zfbfmin also enabled. --- llvm/test/CodeGen/RISCV/rvv/vfncvtbf16-f-f.ll | 4 ++-- llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/vfncvtbf16-f-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfncvtbf16-f-f.ll index 906b4b232d652..4c8fc06ee1959 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfncvtbf16-f-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfncvtbf16-f-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zfbfmin,+experimental-zvfbfmin \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfmin \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zfbfmin,+experimental-zvfbfmin \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfmin \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare <vscale x 1 x bfloat> @llvm.riscv.vfncvtbf16.f.f.w.nxv1bf16.nxv1f32( diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll b/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll index c297cfd1f6eda..35b2df75babf4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfwcvtbf16-f-f.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zfbfmin,+experimental-zvfbfmin \ +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+experimental-zvfbfmin \ ; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zfbfmin,+experimental-zvfbfmin \ +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+experimental-zvfbfmin \ ; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck %s declare <vscale x 1 x float> @llvm.riscv.vfwcvtbf16.f.f.v.nxv1f32.nxv1bf16( From 9b99a307b225d0db071a5cc68cbe4f5a0534e724 Mon Sep 17 00:00:00 2001 From: Qizhi Hu <836744285@qq.com> Date: Mon, 25 Dec 2023 10:13:35 +0800 Subject: [PATCH 242/342] [clang][ASTImporter] skip TemplateTypeParmDecl in VisitTypeAliasTemplateDecl (#74919) Skip checking `TemplateTypeParmDecl ` in `VisitTypeAliasTemplateDecl`. 
[Fix this crash](https://github.com/llvm/llvm-project/issues/74765) Co-authored-by: huqizhi <836744285@qq.com> --- clang/lib/AST/ASTImporter.cpp | 9 +++-- clang/lib/AST/ASTStructuralEquivalence.cpp | 12 ++++++ clang/unittests/AST/ASTImporterTest.cpp | 47 ++++++++++++++++++++++ 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 949310856562c..b61180c4f3491 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -2771,9 +2771,11 @@ ASTNodeImporter::VisitTypeAliasTemplateDecl(TypeAliasTemplateDecl *D) { for (auto *FoundDecl : FoundDecls) { if (!FoundDecl->isInIdentifierNamespace(IDNS)) continue; - if (auto *FoundAlias = dyn_cast<TypeAliasTemplateDecl>(FoundDecl)) - return Importer.MapImported(D, FoundAlias); - ConflictingDecls.push_back(FoundDecl); + if (auto *FoundAlias = dyn_cast<TypeAliasTemplateDecl>(FoundDecl)) { + if (IsStructuralMatch(D, FoundAlias)) + return Importer.MapImported(D, FoundAlias); + ConflictingDecls.push_back(FoundDecl); + } } if (!ConflictingDecls.empty()) { @@ -9402,7 +9404,6 @@ Expected<Decl *> ASTImporter::Import(Decl *FromD) { setImportDeclError(FromD, *Error); return make_error<ASTImportError>(*Error); } - // Make sure that ImportImpl registered the imported decl. assert(ImportedDecls.count(FromD) != 0 && "Missing call to MapImported?"); if (auto Error = ImportAttrs(ToD, FromD)) diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 6bb4bf14b873d..1f492b051e034 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -1977,6 +1977,18 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, D2->getTemplatedDecl()->getType()); } +static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, + TypeAliasTemplateDecl *D1, + TypeAliasTemplateDecl *D2) { + // Check template parameters. + if (!IsTemplateDeclCommonStructurallyEquivalent(Context, D1, D2)) + return false; + + // Check the templated declaration. 
+ return IsStructurallyEquivalent(Context, D1->getTemplatedDecl(), + D2->getTemplatedDecl()); +} + static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, ConceptDecl *D1, ConceptDecl *D2) { diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 6c7b2b64ca2d1..ed8ecb080e268 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -9295,6 +9295,53 @@ TEST_P(ASTImporterOptionSpecificTestBase, // EXPECT_EQ(ToF1Imported->getPreviousDecl(), ToF1); } +TEST_P(ASTImporterOptionSpecificTestBase, + ImportTypeAliasTemplateAfterSimilarCalledTemplateTypeParm) { + const char *Code = + R"( + struct S; + template <typename> + using Callable = S; + template <typename Callable> + int bindingFunctionVTable; + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX17); + + auto *FromCallable = FirstDeclMatcher<TypeAliasTemplateDecl>().match( + FromTU, typeAliasTemplateDecl(hasName("Callable"))); + + auto *FromCallableParm = FirstDeclMatcher<TemplateTypeParmDecl>().match( + FromTU, templateTypeParmDecl(hasName("Callable"))); + + auto *ToFromCallableParm = Import(FromCallableParm, Lang_CXX17); + auto *ToCallable = Import(FromCallable, Lang_CXX17); + EXPECT_TRUE(ToFromCallableParm); + EXPECT_TRUE(ToCallable); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportConflictTypeAliasTemplate) { + const char *ToCode = + R"( + struct S; + template <typename, typename> + using Callable = S; + )"; + const char *Code = + R"( + struct S; + template <typename> + using Callable = S; + )"; + (void)getToTuDecl(ToCode, Lang_CXX17); + Decl *FromTU = getTuDecl(Code, Lang_CXX17); + + auto *FromCallable = FirstDeclMatcher<TypeAliasTemplateDecl>().match( + FromTU, typeAliasTemplateDecl(hasName("Callable"))); + + auto *ImportedCallable = Import(FromCallable, Lang_CXX17); + EXPECT_FALSE(ImportedCallable); +} + INSTANTIATE_TEST_SUITE_P(ParameterizedTests, ASTImporterLookupTableTest, DefaultTestValuesForRunOptions); From 81ae2a8bb01d38162e0269fc6819584af6d60b03 Mon Sep 17 00:00:00 2001 From: Michael Liao <michael.hliao@gmail.com> Date: Sun, 24 Dec 2023 21:48:48 -0500 Subject: [PATCH 243/342] [clang] Fix '-Wunused-variable' warnings. 
NFC --- clang/lib/Driver/ToolChains/Clang.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 4783affd3220b..70dc7e54aca12 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3203,13 +3203,13 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, options::OPT_fstrict_float_cast_overflow, false)) CmdArgs.push_back("-fno-strict-float-cast-overflow"); - if (const Arg *A = Args.getLastArg(options::OPT_fcx_limited_range)) + if (Args.hasArg(options::OPT_fcx_limited_range)) CmdArgs.push_back("-fcx-limited-range"); - if (const Arg *A = Args.getLastArg(options::OPT_fcx_fortran_rules)) + if (Args.hasArg(options::OPT_fcx_fortran_rules)) CmdArgs.push_back("-fcx-fortran-rules"); - if (const Arg *A = Args.getLastArg(options::OPT_fno_cx_limited_range)) + if (Args.hasArg(options::OPT_fno_cx_limited_range)) CmdArgs.push_back("-fno-cx-limited-range"); - if (const Arg *A = Args.getLastArg(options::OPT_fno_cx_fortran_rules)) + if (Args.hasArg(options::OPT_fno_cx_fortran_rules)) CmdArgs.push_back("-fno-cx-fortran-rules"); } From af837d44c7e126eca16426531ed54d94083f3359 Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Mon, 25 Dec 2023 11:18:22 +0800 Subject: [PATCH 244/342] [RISCV][DAG] Teach computeKnownBits consider SEW/LMUL/AVL for vsetvli. (#76158) This patch also adds tests whose masks are too narrow to combine. I think they can help us find bugs caused by overly large known bits. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 23 +- llvm/test/CodeGen/RISCV/rvv/vsetvl-ext.ll | 610 +++++++++++++++++- llvm/test/CodeGen/RISCV/rvv/vsetvlmax-ext.ll | 626 +++++++++++++++++++ 3 files changed, 1250 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/vsetvlmax-ext.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 40518097fcce7..c2508a158837b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16099,13 +16099,26 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, // We can't do anything for most intrinsics. break; case Intrinsic::riscv_vsetvli: - case Intrinsic::riscv_vsetvlimax: - // Assume that VL output is <= 65536. - // TODO: Take SEW and LMUL into account. - if (BitWidth > 17) - Known.Zero.setBitsFrom(17); + case Intrinsic::riscv_vsetvlimax: { + bool HasAVL = IntNo == Intrinsic::riscv_vsetvli; + unsigned VSEW = Op.getConstantOperandVal(HasAVL + 1); + RISCVII::VLMUL VLMUL = + static_cast<RISCVII::VLMUL>(Op.getConstantOperandVal(HasAVL + 2)); + unsigned SEW = RISCVVType::decodeVSEW(VSEW); + auto [LMul, Fractional] = RISCVVType::decodeVLMUL(VLMUL); + uint64_t MaxVL = Subtarget.getRealMaxVLen() / SEW; + MaxVL = (Fractional) ? MaxVL / LMul : MaxVL * LMul; + + // Result of vsetvli must be not larger than AVL.
+ if (HasAVL && isa<ConstantSDNode>(Op.getOperand(1))) + MaxVL = std::min(MaxVL, Op.getConstantOperandVal(1)); + + unsigned KnownZeroFirstBit = Log2_32(MaxVL) + 1; + if (BitWidth > KnownZeroFirstBit) + Known.Zero.setBitsFrom(KnownZeroFirstBit); break; } + } break; } } diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvl-ext.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvl-ext.ll index 5804f8edf84d2..39d73bed25926 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsetvl-ext.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvl-ext.ll @@ -24,12 +24,614 @@ define zeroext i32 @vsetvl_zext() { ret i32 %b } -define i64 @vsetvl_and17bits() { -; CHECK-LABEL: vsetvl_and17bits: +define i64 @vsetvl_e8m1_and14bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8m1_and14bits: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli a0, 1, e16, m2, ta, ma +; CHECK-NEXT: vsetvli a0, a0, e8, m1, ta, ma ; CHECK-NEXT: ret - %a = call i64 @llvm.riscv.vsetvli(i64 1, i64 1, i64 1) + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 0) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvl_e8m1_and13bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8m1_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, m1, ta, ma +; CHECK-NEXT: slli a0, a0, 51 +; CHECK-NEXT: srli a0, a0, 51 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 0) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvl_e8m1_constant_avl() { +; CHECK-LABEL: vsetvl_e8m1_constant_avl: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli a0, 1, e8, m1, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 1, i64 0, i64 0) + %b = and i64 %a, 1 + ret i64 %b +} + +define i64 @vsetvl_e8m2_and15bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8m2_and15bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, m2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 1) + %b = and i64 %a, 32767 + ret i64 %b +} + +define i64 @vsetvl_e8m2_and14bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8m2_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, m2, ta, ma +; CHECK-NEXT: slli a0, a0, 50 +; CHECK-NEXT: srli a0, a0, 50 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 1) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvl_e8m4_and16bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8m4_and16bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, m4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 2) + %b = and i64 %a, 65535 + ret i64 %b +} + +define i64 @vsetvl_e8m4_and15bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8m4_and15bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, m4, ta, ma +; CHECK-NEXT: slli a0, a0, 49 +; CHECK-NEXT: srli a0, a0, 49 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 2) + %b = and i64 %a, 32767 + ret i64 %b +} + +define i64 @vsetvl_e8m8_and17bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8m8_and17bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, m8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 3) %b = and i64 %a, 131071 ret i64 %b } + +define i64 @vsetvl_e8m8_and16bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8m8_and16bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, m8, ta, ma +; CHECK-NEXT: slli a0, a0, 48 +; CHECK-NEXT: srli a0, a0, 48 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 3) + %b = and i64 %a, 65535 + ret i64 %b +} + +define i64 @vsetvl_e8mf2_and11bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8mf2_and11bits: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli a0, a0, e8, mf8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 5) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvl_e8mf2_and10bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8mf2_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, mf8, ta, ma +; CHECK-NEXT: andi a0, a0, 1023 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 5) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvl_e8mf4_and12bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8mf4_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, mf4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 6) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvl_e8mf4_and11bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8mf4_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, mf4, ta, ma +; CHECK-NEXT: andi a0, a0, 2047 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 6) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvl_e8mf8_and13bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8mf8_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, mf2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 7) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvl_e8mf8_and12bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e8mf8_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e8, mf2, ta, ma +; CHECK-NEXT: slli a0, a0, 52 +; CHECK-NEXT: srli a0, a0, 52 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 0, i64 7) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvl_e16m1_and13bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16m1_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, m1, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 0) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvl_e16m1_and12bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16m1_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, m1, ta, ma +; CHECK-NEXT: slli a0, a0, 52 +; CHECK-NEXT: srli a0, a0, 52 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 0) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvl_e16m2_and14bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16m2_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, m2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 1) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvl_e16m2_and13bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16m2_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, m2, ta, ma +; CHECK-NEXT: slli a0, a0, 51 +; CHECK-NEXT: srli a0, a0, 51 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 1) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvl_e16m4_and15bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16m4_and15bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, m4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 2) + %b = and i64 %a, 32767 + ret i64 %b +} + +define i64 @vsetvl_e16m4_and14bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16m4_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, m4, ta, ma +; CHECK-NEXT: slli a0, a0, 50 +; CHECK-NEXT: srli a0, a0, 50 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 2) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvl_e16m8_and16bits(i64 %avl) { 
+; CHECK-LABEL: vsetvl_e16m8_and16bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, m8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 3) + %b = and i64 %a, 65535 + ret i64 %b +} + +define i64 @vsetvl_e16m8_and15bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16m8_and15bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, m8, ta, ma +; CHECK-NEXT: slli a0, a0, 49 +; CHECK-NEXT: srli a0, a0, 49 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 3) + %b = and i64 %a, 32767 + ret i64 %b +} + +define i64 @vsetvl_e16mf2_and10bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16mf2_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, mf8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 5) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvl_e16mf2_and9bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16mf2_and9bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, mf8, ta, ma +; CHECK-NEXT: andi a0, a0, 511 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 5) + %b = and i64 %a, 511 + ret i64 %b +} + +define i64 @vsetvl_e16mf4_and11bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16mf4_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, mf4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 6) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvl_e16mf4_and10bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16mf4_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, mf4, ta, ma +; CHECK-NEXT: andi a0, a0, 1023 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 6) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvl_e16mf8_and12bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16mf8_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, mf2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 7) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvl_e16mf8_and11bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e16mf8_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e16, mf2, ta, ma +; CHECK-NEXT: andi a0, a0, 2047 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 1, i64 7) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvl_e32m1_and12bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32m1_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, m1, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 0) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvl_e32m1_and11bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32m1_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, m1, ta, ma +; CHECK-NEXT: andi a0, a0, 2047 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 0) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvl_e32m2_and13bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32m2_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, m2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 1) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvl_e32m2_and12bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32m2_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, m2, ta, ma +; CHECK-NEXT: slli a0, a0, 52 +; CHECK-NEXT: srli a0, a0, 52 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 1) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 
@vsetvl_e32m4_and14bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32m4_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, m4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 2) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvl_e32m4_and13bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32m4_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, m4, ta, ma +; CHECK-NEXT: slli a0, a0, 51 +; CHECK-NEXT: srli a0, a0, 51 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 2) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvl_e32m8_and15bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32m8_and15bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, m8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 3) + %b = and i64 %a, 32767 + ret i64 %b +} + +define i64 @vsetvl_e32m8_and14bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32m8_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, m8, ta, ma +; CHECK-NEXT: slli a0, a0, 50 +; CHECK-NEXT: srli a0, a0, 50 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 3) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvl_e32mf2_and9bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32mf2_and9bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, mf8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 5) + %b = and i64 %a, 511 + ret i64 %b +} + +define i64 @vsetvl_e32mf2_and8bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32mf2_and8bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, mf8, ta, ma +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 5) + %b = and i64 %a, 255 + ret i64 %b +} + +define i64 @vsetvl_e32mf4_and10bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32mf4_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, mf4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 6) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvl_e32mf4_and9bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32mf4_and9bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, mf4, ta, ma +; CHECK-NEXT: andi a0, a0, 511 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 6) + %b = and i64 %a, 511 + ret i64 %b +} + +define i64 @vsetvl_e32mf8_and11bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32mf8_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, mf2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 7) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvl_e32mf8_and10bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e32mf8_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e32, mf2, ta, ma +; CHECK-NEXT: andi a0, a0, 1023 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 2, i64 7) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvl_e64m1_and11bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64m1_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, m1, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 0) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvl_e64m1_and10bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64m1_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, m1, ta, ma +; CHECK-NEXT: andi a0, a0, 1023 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 0) + %b = and i64 %a, 1023 + ret i64 %b +} + 
+define i64 @vsetvl_e64m2_and12bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64m2_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, m2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 1) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvl_e64m2_and11bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64m2_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, m2, ta, ma +; CHECK-NEXT: andi a0, a0, 2047 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 1) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvl_e64m4_and13bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64m4_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, m4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 2) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvl_e64m4_and12bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64m4_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, m4, ta, ma +; CHECK-NEXT: slli a0, a0, 52 +; CHECK-NEXT: srli a0, a0, 52 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 2) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvl_e64m8_and14bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64m8_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, m8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 3) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvl_e64m8_and13bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64m8_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, m8, ta, ma +; CHECK-NEXT: slli a0, a0, 51 +; CHECK-NEXT: srli a0, a0, 51 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 3) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvl_e64mf2_and8bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64mf2_and8bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, mf8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 5) + %b = and i64 %a, 255 + ret i64 %b +} + +define i64 @vsetvl_e64mf2_and7bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64mf2_and7bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, mf8, ta, ma +; CHECK-NEXT: andi a0, a0, 127 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 5) + %b = and i64 %a, 127 + ret i64 %b +} + +define i64 @vsetvl_e64mf4_and9bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64mf4_and9bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, mf4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 6) + %b = and i64 %a, 511 + ret i64 %b +} + +define i64 @vsetvl_e64mf4_and8bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64mf4_and8bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, mf4, ta, ma +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 6) + %b = and i64 %a, 255 + ret i64 %b +} + +define i64 @vsetvl_e64mf8_and10bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64mf8_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, mf2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 7) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvl_e64mf8_and9bits(i64 %avl) { +; CHECK-LABEL: vsetvl_e64mf8_and9bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, a0, e64, mf2, ta, ma +; CHECK-NEXT: andi a0, a0, 511 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvli(i64 %avl, i64 3, i64 7) + %b = and i64 %a, 511 + ret i64 %b +} 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvlmax-ext.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvlmax-ext.ll new file mode 100644 index 0000000000000..b2a676dc0daf4 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvlmax-ext.ll @@ -0,0 +1,626 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s + +declare i64 @llvm.riscv.vsetvlimax(i64, i64); + +define signext i32 @vsetvlmax_sext() { +; CHECK-LABEL: vsetvlmax_sext: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 1) + %b = trunc i64 %a to i32 + ret i32 %b +} + +define zeroext i32 @vsetvlmax_zext() { +; CHECK-LABEL: vsetvlmax_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 1) + %b = trunc i64 %a to i32 + ret i32 %b +} + +define i64 @vsetvlmax_e8m1_and14bits() { +; CHECK-LABEL: vsetvlmax_e8m1_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 0) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvlmax_e8m1_and13bits() { +; CHECK-LABEL: vsetvlmax_e8m1_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma +; CHECK-NEXT: slli a0, a0, 51 +; CHECK-NEXT: srli a0, a0, 51 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 0) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvlmax_e8m2_and15bits() { +; CHECK-LABEL: vsetvlmax_e8m2_and15bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 1) + %b = and i64 %a, 32767 + ret i64 %b +} + +define i64 @vsetvlmax_e8m2_and14bits() { +; CHECK-LABEL: vsetvlmax_e8m2_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: slli a0, a0, 50 +; CHECK-NEXT: srli a0, a0, 50 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 1) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvlmax_e8m4_and16bits() { +; CHECK-LABEL: vsetvlmax_e8m4_and16bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 2) + %b = and i64 %a, 65535 + ret i64 %b +} + +define i64 @vsetvlmax_e8m4_and15bits() { +; CHECK-LABEL: vsetvlmax_e8m4_and15bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; CHECK-NEXT: slli a0, a0, 49 +; CHECK-NEXT: srli a0, a0, 49 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 2) + %b = and i64 %a, 32767 + ret i64 %b +} + +define i64 @vsetvlmax_e8m8_and17bits() { +; CHECK-LABEL: vsetvlmax_e8m8_and17bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 3) + %b = and i64 %a, 131071 + ret i64 %b +} + +define i64 @vsetvlmax_e8m8_and16bits() { +; CHECK-LABEL: vsetvlmax_e8m8_and16bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma +; CHECK-NEXT: slli a0, a0, 48 +; CHECK-NEXT: srli a0, a0, 48 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 3) + %b = and i64 %a, 65535 + ret i64 %b +} + +define i64 @vsetvlmax_e8mf2_and11bits() { +; CHECK-LABEL: vsetvlmax_e8mf2_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 5) + %b 
= and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvlmax_e8mf2_and10bits() { +; CHECK-LABEL: vsetvlmax_e8mf2_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf8, ta, ma +; CHECK-NEXT: andi a0, a0, 1023 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 5) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvlmax_e8mf4_and12bits() { +; CHECK-LABEL: vsetvlmax_e8mf4_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 6) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvlmax_e8mf4_and11bits() { +; CHECK-LABEL: vsetvlmax_e8mf4_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-NEXT: andi a0, a0, 2047 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 6) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvlmax_e8mf8_and13bits() { +; CHECK-LABEL: vsetvlmax_e8mf8_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 7) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvlmax_e8mf8_and12bits() { +; CHECK-LABEL: vsetvlmax_e8mf8_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: slli a0, a0, 52 +; CHECK-NEXT: srli a0, a0, 52 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 0, i64 7) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvlmax_e16m1_and13bits() { +; CHECK-LABEL: vsetvlmax_e16m1_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 0) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvlmax_e16m1_and12bits() { +; CHECK-LABEL: vsetvlmax_e16m1_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: slli a0, a0, 52 +; CHECK-NEXT: srli a0, a0, 52 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 0) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvlmax_e16m2_and14bits() { +; CHECK-LABEL: vsetvlmax_e16m2_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 1) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvlmax_e16m2_and13bits() { +; CHECK-LABEL: vsetvlmax_e16m2_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: slli a0, a0, 51 +; CHECK-NEXT: srli a0, a0, 51 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 1) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvlmax_e16m4_and15bits() { +; CHECK-LABEL: vsetvlmax_e16m4_and15bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 2) + %b = and i64 %a, 32767 + ret i64 %b +} + +define i64 @vsetvlmax_e16m4_and14bits() { +; CHECK-LABEL: vsetvlmax_e16m4_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: slli a0, a0, 50 +; CHECK-NEXT: srli a0, a0, 50 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 2) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvlmax_e16m8_and16bits() { +; CHECK-LABEL: vsetvlmax_e16m8_and16bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 3) + %b = and i64 %a, 65535 + 
ret i64 %b +} + +define i64 @vsetvlmax_e16m8_and15bits() { +; CHECK-LABEL: vsetvlmax_e16m8_and15bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: slli a0, a0, 49 +; CHECK-NEXT: srli a0, a0, 49 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 3) + %b = and i64 %a, 32767 + ret i64 %b +} + +define i64 @vsetvlmax_e16mf2_and10bits() { +; CHECK-LABEL: vsetvlmax_e16mf2_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 5) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvlmax_e16mf2_and9bits() { +; CHECK-LABEL: vsetvlmax_e16mf2_and9bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf8, ta, ma +; CHECK-NEXT: andi a0, a0, 511 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 5) + %b = and i64 %a, 511 + ret i64 %b +} + +define i64 @vsetvlmax_e16mf4_and11bits() { +; CHECK-LABEL: vsetvlmax_e16mf4_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 6) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvlmax_e16mf4_and10bits() { +; CHECK-LABEL: vsetvlmax_e16mf4_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: andi a0, a0, 1023 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 6) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvlmax_e16mf8_and12bits() { +; CHECK-LABEL: vsetvlmax_e16mf8_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 7) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvlmax_e16mf8_and11bits() { +; CHECK-LABEL: vsetvlmax_e16mf8_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: andi a0, a0, 2047 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 1, i64 7) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvlmax_e32m1_and12bits() { +; CHECK-LABEL: vsetvlmax_e32m1_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 0) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvlmax_e32m1_and11bits() { +; CHECK-LABEL: vsetvlmax_e32m1_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: andi a0, a0, 2047 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 0) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvlmax_e32m2_and13bits() { +; CHECK-LABEL: vsetvlmax_e32m2_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 1) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvlmax_e32m2_and12bits() { +; CHECK-LABEL: vsetvlmax_e32m2_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: slli a0, a0, 52 +; CHECK-NEXT: srli a0, a0, 52 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 1) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvlmax_e32m4_and14bits() { +; CHECK-LABEL: vsetvlmax_e32m4_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 2) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvlmax_e32m4_and13bits() { +; 
CHECK-LABEL: vsetvlmax_e32m4_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: slli a0, a0, 51 +; CHECK-NEXT: srli a0, a0, 51 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 2) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvlmax_e32m8_and15bits() { +; CHECK-LABEL: vsetvlmax_e32m8_and15bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 3) + %b = and i64 %a, 32767 + ret i64 %b +} + +define i64 @vsetvlmax_e32m8_and14bits() { +; CHECK-LABEL: vsetvlmax_e32m8_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: slli a0, a0, 50 +; CHECK-NEXT: srli a0, a0, 50 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 3) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvlmax_e32mf2_and9bits() { +; CHECK-LABEL: vsetvlmax_e32mf2_and9bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 5) + %b = and i64 %a, 511 + ret i64 %b +} + +define i64 @vsetvlmax_e32mf2_and8bits() { +; CHECK-LABEL: vsetvlmax_e32mf2_and8bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf8, ta, ma +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 5) + %b = and i64 %a, 255 + ret i64 %b +} + +define i64 @vsetvlmax_e32mf4_and10bits() { +; CHECK-LABEL: vsetvlmax_e32mf4_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 6) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvlmax_e32mf4_and9bits() { +; CHECK-LABEL: vsetvlmax_e32mf4_and9bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf4, ta, ma +; CHECK-NEXT: andi a0, a0, 511 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 6) + %b = and i64 %a, 511 + ret i64 %b +} + +define i64 @vsetvlmax_e32mf8_and11bits() { +; CHECK-LABEL: vsetvlmax_e32mf8_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 7) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvlmax_e32mf8_and10bits() { +; CHECK-LABEL: vsetvlmax_e32mf8_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma +; CHECK-NEXT: andi a0, a0, 1023 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 2, i64 7) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvlmax_e64m1_and11bits() { +; CHECK-LABEL: vsetvlmax_e64m1_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 0) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvlmax_e64m1_and10bits() { +; CHECK-LABEL: vsetvlmax_e64m1_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: andi a0, a0, 1023 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 0) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvlmax_e64m2_and12bits() { +; CHECK-LABEL: vsetvlmax_e64m2_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 1) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvlmax_e64m2_and11bits() { +; CHECK-LABEL: vsetvlmax_e64m2_and11bits: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: andi a0, a0, 2047 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 1) + %b = and i64 %a, 2047 + ret i64 %b +} + +define i64 @vsetvlmax_e64m4_and13bits() { +; CHECK-LABEL: vsetvlmax_e64m4_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 2) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvlmax_e64m4_and12bits() { +; CHECK-LABEL: vsetvlmax_e64m4_and12bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: slli a0, a0, 52 +; CHECK-NEXT: srli a0, a0, 52 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 2) + %b = and i64 %a, 4095 + ret i64 %b +} + +define i64 @vsetvlmax_e64m8_and14bits() { +; CHECK-LABEL: vsetvlmax_e64m8_and14bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 3) + %b = and i64 %a, 16383 + ret i64 %b +} + +define i64 @vsetvlmax_e64m8_and13bits() { +; CHECK-LABEL: vsetvlmax_e64m8_and13bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: slli a0, a0, 51 +; CHECK-NEXT: srli a0, a0, 51 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 3) + %b = and i64 %a, 8191 + ret i64 %b +} + +define i64 @vsetvlmax_e64mf2_and8bits() { +; CHECK-LABEL: vsetvlmax_e64mf2_and8bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, mf8, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 5) + %b = and i64 %a, 255 + ret i64 %b +} + +define i64 @vsetvlmax_e64mf2_and7bits() { +; CHECK-LABEL: vsetvlmax_e64mf2_and7bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, mf8, ta, ma +; CHECK-NEXT: andi a0, a0, 127 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 5) + %b = and i64 %a, 127 + ret i64 %b +} + +define i64 @vsetvlmax_e64mf4_and9bits() { +; CHECK-LABEL: vsetvlmax_e64mf4_and9bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, mf4, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 6) + %b = and i64 %a, 511 + ret i64 %b +} + +define i64 @vsetvlmax_e64mf4_and8bits() { +; CHECK-LABEL: vsetvlmax_e64mf4_and8bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, mf4, ta, ma +; CHECK-NEXT: andi a0, a0, 255 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 6) + %b = and i64 %a, 255 + ret i64 %b +} + +define i64 @vsetvlmax_e64mf8_and10bits() { +; CHECK-LABEL: vsetvlmax_e64mf8_and10bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, mf2, ta, ma +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 7) + %b = and i64 %a, 1023 + ret i64 %b +} + +define i64 @vsetvlmax_e64mf8_and9bits() { +; CHECK-LABEL: vsetvlmax_e64mf8_and9bits: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, mf2, ta, ma +; CHECK-NEXT: andi a0, a0, 511 +; CHECK-NEXT: ret + %a = call i64 @llvm.riscv.vsetvlimax(i64 3, i64 7) + %b = and i64 %a, 511 + ret i64 %b +} From 966d564e43e650b9c34f9c67829d3947f52add91 Mon Sep 17 00:00:00 2001 From: Dan McGregor <dan.mcgregor@usask.ca> Date: Sun, 24 Dec 2023 22:37:35 -0600 Subject: [PATCH 245/342] asan_static x86-64: Support 64-bit ASAN_SHADOW_OFFSET_CONST redux (#76185) Similar to b9935bb02a50, but also apply a similar change to ACCESS_CHECK_ADD. If ASAN_SHADOW_OFFSET_CONST cannot be encoded as a displacement, switch to `movabsq` and the register offset variant of cmp. 
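For illustration, a rough sketch of the two forms the check macro can now emit, shown for the byte-sized check (the shadow offsets below are arbitrary example values, not taken from any particular platform):

  # offset fits in a signed 32-bit displacement (existing form)
  shr     $0x3, %r10
  cmpb    $0x0, 0x7fff8000(%r10)

  # offset too wide for a displacement (new form)
  shr     $0x3, %r10
  movabsq $0x123456789000, %r11
  cmpb    $0x0, (%r10,%r11)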
--- compiler-rt/lib/asan/asan_rtl_x86_64.S | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/compiler-rt/lib/asan/asan_rtl_x86_64.S b/compiler-rt/lib/asan/asan_rtl_x86_64.S index e44587ac4331c..9c5289856d8ae 100644 --- a/compiler-rt/lib/asan/asan_rtl_x86_64.S +++ b/compiler-rt/lib/asan/asan_rtl_x86_64.S @@ -89,7 +89,12 @@ ENDF #define ASAN_MEMORY_ACCESS_CHECK_ADD(reg, op, s, c) \ mov %##reg,%r10 ;\ shr $0x3,%r10 ;\ + .if ASAN_SHADOW_OFFSET_CONST < 0x80000000 ;\ ##c $0x0,ASAN_SHADOW_OFFSET_CONST(%r10) ;\ + .else ;\ + movabsq $ASAN_SHADOW_OFFSET_CONST,%r11 ;\ + ##c $0x0,(%r10,%r11) ;\ + .endif ;\ jne FLABEL(reg, op, s, add) ;\ retq ;\ From 41cb686d0f05d2ed35e52f8c5e498ca3dd0809c4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Sun, 24 Dec 2023 22:45:50 -0800 Subject: [PATCH 246/342] [CodeGen] Use range-based for loops (NFC) --- llvm/lib/CodeGen/LiveRangeEdit.cpp | 3 +-- llvm/lib/CodeGen/MachineInstrBundle.cpp | 6 ++---- llvm/lib/CodeGen/RegisterClassInfo.cpp | 3 +-- llvm/lib/CodeGen/RegisterCoalescer.cpp | 6 ++---- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 3 +-- .../CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 3 +-- llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 12 ++++-------- .../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 3 +-- llvm/lib/CodeGen/TwoAddressInstructionPass.cpp | 3 +-- 9 files changed, 14 insertions(+), 28 deletions(-) diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index 0203034b5a014..643370f0573d1 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -426,8 +426,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) { // Erase any virtregs that are now empty and unused. There may be <undef> // uses around. Keep the empty live range in that case. - for (unsigned i = 0, e = RegsToErase.size(); i != e; ++i) { - Register Reg = RegsToErase[i]; + for (Register Reg : RegsToErase) { if (LIS.hasInterval(Reg) && MRI.reg_nodbg_empty(Reg)) { ToShrink.remove(&LIS.getInterval(Reg)); eraseVirtReg(Reg); diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index b9db34f7be954..6eeed8b5c3f7d 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -208,8 +208,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, } SmallSet<Register, 32> Added; - for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) { - Register Reg = LocalDefs[i]; + for (Register Reg : LocalDefs) { if (Added.insert(Reg).second) { // If it's not live beyond end of the bundle, mark it dead. bool isDead = DeadDefSet.count(Reg) || KilledDefSet.count(Reg); @@ -218,8 +217,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, } } - for (unsigned i = 0, e = ExternUses.size(); i != e; ++i) { - Register Reg = ExternUses[i]; + for (Register Reg : ExternUses) { bool isKill = KilledUseSet.count(Reg); bool isUndef = UndefUseSet.count(Reg); MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) | diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp index fba8c35ecec26..17a9f55cccc0c 100644 --- a/llvm/lib/CodeGen/RegisterClassInfo.cpp +++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -165,8 +165,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { assert(RCI.NumRegs <= NumRegs && "Allocation order larger than regclass"); // CSR aliases go after the volatile registers, preserve the target's order. 
- for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) { - unsigned PhysReg = CSRAlias[i]; + for (unsigned PhysReg : CSRAlias) { uint8_t Cost = RegCosts[PhysReg]; if (Cost != LastCost) LastCostChange = N; diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 397fff5263426..3fbb93795075d 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1621,8 +1621,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, NewMI.addOperand(MO); SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI); - for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) { - MCRegister Reg = NewMIImplDefs[i]; + for (MCRegister Reg : NewMIImplDefs) { for (MCRegUnit Unit : TRI->regunits(Reg)) if (LiveRange *LR = LIS->getCachedRegUnit(Unit)) LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator()); @@ -4269,8 +4268,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { InflateRegs.end()); LLVM_DEBUG(dbgs() << "Trying to inflate " << InflateRegs.size() << " regs.\n"); - for (unsigned i = 0, e = InflateRegs.size(); i != e; ++i) { - Register Reg = InflateRegs[i]; + for (Register Reg : InflateRegs) { if (MRI->reg_nodbg_empty(Reg)) continue; if (MRI->recomputeRegClass(Reg)) { diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index a831295863399..f3d8edb8926b6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1000,8 +1000,7 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) { if (!CanLowerReturn) return false; - for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { - EVT VT = RetTys[I]; + for (EVT VT : RetTys) { MVT RegisterVT = TLI.getRegisterType(CLI.RetTy->getContext(), VT); unsigned NumRegs = TLI.getNumRegisters(CLI.RetTy->getContext(), VT); for (unsigned i = 0; i != NumRegs; ++i) { diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 03cba892a167b..5926a60581112 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -377,8 +377,7 @@ Register FunctionLoweringInfo::CreateRegs(Type *Ty, bool isDivergent) { ComputeValueVTs(*TLI, MF->getDataLayout(), Ty, ValueVTs); Register FirstReg; - for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) { - EVT ValueVT = ValueVTs[Value]; + for (EVT ValueVT : ValueVTs) { MVT RegisterVT = TLI->getRegisterType(Ty->getContext(), ValueVT); unsigned NumRegs = TLI->getNumRegisters(Ty->getContext(), ValueVT); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index ab4c33c9e976b..f73ddfee2b90f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -296,28 +296,24 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) { if (isNewLoad) AddPred(LoadSU, ChainPred); } - for (unsigned i = 0, e = LoadPreds.size(); i != e; ++i) { - const SDep &Pred = LoadPreds[i]; + for (const SDep &Pred : LoadPreds) { RemovePred(SU, Pred); if (isNewLoad) { AddPred(LoadSU, Pred); } } - for (unsigned i = 0, e = NodePreds.size(); i != e; ++i) { - const SDep &Pred = NodePreds[i]; + for (const SDep &Pred : NodePreds) { RemovePred(SU, Pred); AddPred(NewSU, Pred); } - for (unsigned i = 0, e = NodeSuccs.size(); i != e; ++i) { - SDep D = NodeSuccs[i]; + for (SDep D : NodeSuccs) { 
SUnit *SuccDep = D.getSUnit(); D.setSUnit(SU); RemovePred(SuccDep, D); D.setSUnit(NewSU); AddPred(SuccDep, D); } - for (unsigned i = 0, e = ChainSuccs.size(); i != e; ++i) { - SDep D = ChainSuccs[i]; + for (SDep D : ChainSuccs) { SUnit *SuccDep = D.getSUnit(); D.setSUnit(SU); RemovePred(SuccDep, D); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 12ed4a82ee91a..3c4b285cb0674 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -10627,8 +10627,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { else if (CLI.RetZExt) AssertOp = ISD::AssertZext; unsigned CurReg = 0; - for (unsigned I = 0, E = RetTys.size(); I != E; ++I) { - EVT VT = RetTys[I]; + for (EVT VT : RetTys) { MVT RegisterVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), CLI.CallConv, VT); unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(), diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index bf689dbd308f7..526cb847e8a0b 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1124,8 +1124,7 @@ bool TwoAddressInstructionPass::rescheduleKillAboveMI( } } - for (unsigned i = 0, e = OtherDefs.size(); i != e; ++i) { - Register MOReg = OtherDefs[i]; + for (Register MOReg : OtherDefs) { if (regOverlapsSet(Uses, MOReg)) return false; if (MOReg.isPhysical() && regOverlapsSet(LiveDefs, MOReg)) From af8d0502860d9ec2ac9682f35a19c83300f8b25a Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Sun, 24 Dec 2023 23:09:55 -0800 Subject: [PATCH 247/342] [Target] Use range-based for loops (NFC) --- llvm/lib/Target/ARM/ARMFrameLowering.cpp | 3 +-- llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 6 ++---- llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 3 +-- llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp | 3 +-- llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 6 ++---- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 +-- 6 files changed, 8 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 10d9c7f275beb..eeb7f64aa5810 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -2692,8 +2692,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, const Align TargetAlign = getStackAlign(); if (TargetAlign >= Align(8) && (NumGPRSpills & 1)) { if (CS1Spilled && !UnspilledCS1GPRs.empty()) { - for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) { - unsigned Reg = UnspilledCS1GPRs[i]; + for (unsigned Reg : UnspilledCS1GPRs) { // Don't spill high register if the function is thumb. In the case of // Windows on ARM, accept R11 (frame pointer) if (!AFI->isThumbFunction() || diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index a679699a66c75..ed9d30c3c3ab9 100644 --- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -2604,16 +2604,14 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { } // Re-schedule loads. 
- for (unsigned i = 0, e = LdBases.size(); i != e; ++i) { - unsigned Base = LdBases[i]; + for (unsigned Base : LdBases) { SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base]; if (Lds.size() > 1) RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap, RegisterMap); } // Re-schedule stores. - for (unsigned i = 0, e = StBases.size(); i != e; ++i) { - unsigned Base = StBases[i]; + for (unsigned Base : StBases) { SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base]; if (Sts.size() > 1) RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap, RegisterMap); diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index e2d0aeee092e1..ebb269c6e6e06 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -937,8 +937,7 @@ void DXILBitcodeWriter::writeAttributeTable() { Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3); SmallVector<uint64_t, 64> Record; - for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { - AttributeList AL = Attrs[i]; + for (AttributeList AL : Attrs) { for (unsigned i : AL.indexes()) { AttributeSet AS = AL.getAttributes(i); if (AS.hasAttributes()) diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp index 47fbf0a69518b..dae316ccb5e90 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp @@ -2860,8 +2860,7 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI, // For each defined register, if it is a constant, create an instruction // NewR = const // and replace all uses of the defined register with NewR. - for (unsigned i = 0, n = DefRegs.size(); i < n; ++i) { - unsigned R = DefRegs[i]; + for (unsigned R : DefRegs) { const LatticeCell &L = Inputs.get(R); if (L.isBottom()) continue; diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index efb0d405fef2c..e08566718d7cd 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -1337,8 +1337,7 @@ OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb, // segments that are used in the output. unsigned Seg0 = ~0u, Seg1 = ~0u; - for (int I = 0, E = SegMap.size(); I != E; ++I) { - unsigned X = SegMap[I]; + for (unsigned X : SegMap) { if (X == ~0u) continue; if (Seg0 == ~0u) @@ -2037,8 +2036,7 @@ HvxSelector::completeToPerfect(ArrayRef<uint32_t> Completions, unsigned Width) { #ifndef NDEBUG // Check that we have generated a valid completion. 
uint32_t OrAll = 0; - for (unsigned I = 0, E = Comps.size(); I != E; ++I) { - uint32_t C = Comps[I]; + for (uint32_t C : Comps) { assert(isPowerOf2_32(C)); OrAll |= C; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 49112862a3142..63bdf24d6b4f5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7562,8 +7562,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, } else DstVec = DAG.getUNDEF(VT); - for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) { - unsigned InsertIdx = NonConstIdx[i]; + for (unsigned InsertIdx : NonConstIdx) { DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(InsertIdx), DAG.getIntPtrConstant(InsertIdx, dl)); From fe21b3941df24420b72e789dcf67de2dc17c4417 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Sun, 24 Dec 2023 23:38:25 -0800 Subject: [PATCH 248/342] [Basic] Use range-based for loops (NFC) --- clang/include/clang/Basic/PlistSupport.h | 3 +-- clang/lib/Basic/Warnings.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/PlistSupport.h b/clang/include/clang/Basic/PlistSupport.h index 557462a5b90d0..d52d196019cf8 100644 --- a/clang/include/clang/Basic/PlistSupport.h +++ b/clang/include/clang/Basic/PlistSupport.h @@ -77,8 +77,7 @@ inline raw_ostream &EmitInteger(raw_ostream &o, int64_t value) { inline raw_ostream &EmitString(raw_ostream &o, StringRef s) { o << "<string>"; - for (StringRef::const_iterator I = s.begin(), E = s.end(); I != E; ++I) { - char c = *I; + for (char c : s) { switch (c) { default: o << c; diff --git a/clang/lib/Basic/Warnings.cpp b/clang/lib/Basic/Warnings.cpp index cb23d844ef8f6..bab1af4f03b67 100644 --- a/clang/lib/Basic/Warnings.cpp +++ b/clang/lib/Basic/Warnings.cpp @@ -198,8 +198,7 @@ void clang::ProcessWarningOptions(DiagnosticsEngine &Diags, } } - for (unsigned i = 0, e = Opts.Remarks.size(); i != e; ++i) { - StringRef Opt = Opts.Remarks[i]; + for (StringRef Opt : Opts.Remarks) { const auto Flavor = diag::Flavor::Remark; // Check to see if this warning starts with "no-", if so, this is a From 03b774762aacbaaf9eefd4ac006f3be36442bcb7 Mon Sep 17 00:00:00 2001 From: "A. Jiang" <de34@live.cn> Date: Mon, 25 Dec 2023 17:18:28 +0800 Subject: [PATCH 249/342] [libc++][test] Move `abi.compile.pass.cpp` to `libcxx` subdirectory (#76349) This test file mainly asserts the implementation details of libc++, so it should be in the `libcxx` subdirectory. --- .../containers/sequences/deque/abi.compile.pass.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libcxx/test/{std => libcxx}/containers/sequences/deque/abi.compile.pass.cpp (100%) diff --git a/libcxx/test/std/containers/sequences/deque/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp similarity index 100% rename from libcxx/test/std/containers/sequences/deque/abi.compile.pass.cpp rename to libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp From 88548df0fc08364bd03148c936e36f0bb07dde8a Mon Sep 17 00:00:00 2001 From: Lu Weining <luweining@loongson.cn> Date: Mon, 25 Dec 2023 17:40:48 +0800 Subject: [PATCH 250/342] [lld][LoongArch] Support the R_LARCH_CALL36 relocation type (#73346) R_LARCH_CALL36 was designed for function call on medium code model where the 2 instructions (pcaddu18i + jirl) must be adjacent. This is expected to replace current medium code model implementation, i.e. 
R_LARCH_PCALA_{HI20,LO12} on pcalau12i + jirl. See
https://github.com/loongson/la-abi-specs/pull/3 for more details.
---
 lld/ELF/Arch/LoongArch.cpp      | 20 ++++++++++
 lld/test/ELF/loongarch-call36.s | 69 +++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 lld/test/ELF/loongarch-call36.s

diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 1c3e015efc164..996f9957a63ce 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -463,6 +463,7 @@ RelExpr LoongArch::getRelExpr(const RelType type, const Symbol &s,
   case R_LARCH_B16:
   case R_LARCH_B21:
   case R_LARCH_B26:
+  case R_LARCH_CALL36:
     return R_PLT_PC;
   case R_LARCH_GOT_PC_HI20:
   case R_LARCH_GOT64_PC_LO20:
@@ -590,6 +591,25 @@ void LoongArch::relocate(uint8_t *loc, const Relocation &rel,
     write32le(loc, setD10k16(read32le(loc), val >> 2));
     return;
 
+  case R_LARCH_CALL36: {
+    // This relocation is designed for adjacent pcaddu18i+jirl pairs that
+    // are patched at the same time. Because of sign extension of these insns'
+    // immediate fields, the relocation range is [-128G - 0x20000, +128G -
+    // 0x20000) (of course must be 4-byte aligned).
+    if (((int64_t)val + 0x20000) != llvm::SignExtend64(val + 0x20000, 38))
+      reportRangeError(loc, rel, Twine(val), llvm::minIntN(38) - 0x20000,
+                       llvm::maxIntN(38) - 0x20000);
+    checkAlignment(loc, val, 4, rel);
+    // Since jirl performs sign extension on the offset immediate, add (1<<17)
+    // to the original val to get the correct hi20.
+    uint32_t hi20 = extractBits(val + (1 << 17), 37, 18);
+    // Despite the name, the lower part is actually 18 bits, 4-byte aligned.
+    uint32_t lo16 = extractBits(val, 17, 2);
+    write32le(loc, setJ20(read32le(loc), hi20));
+    write32le(loc + 4, setK16(read32le(loc + 4), lo16));
+    return;
+  }
+
   // Relocs intended for `addi`, `ld` or `st`.
   case R_LARCH_PCALA_LO12:
     // We have to again inspect the insn word to handle the R_LARCH_PCALA_LO12
diff --git a/lld/test/ELF/loongarch-call36.s b/lld/test/ELF/loongarch-call36.s
new file mode 100644
index 0000000000000..2d25a2ac64ed7
--- /dev/null
+++ b/lld/test/ELF/loongarch-call36.s
@@ -0,0 +1,69 @@
+# REQUIRES: loongarch
+
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc --filetype=obj --triple=loongarch64-unknown-elf %t/a.s -o %t/a.o
+
+# RUN: ld.lld %t/a.o --section-start=.text=0x20010 --section-start=.sec.foo=0x60020 -o %t/exe1
+# RUN: llvm-objdump --no-show-raw-insn -d %t/exe1 | FileCheck --match-full-lines %s --check-prefix=EXE1
+## hi20 = target - pc + (1 << 17) >> 18 = 0x60020 - 0x20010 + 0x20000 >> 18 = 1
+## lo18 = target - pc & (1 << 18) - 1 = 0x60020 - 0x20010 & 0x3ffff = 16
+# EXE1: 20010: pcaddu18i $t0, 1
+# EXE1-NEXT: 20014: jirl $zero, $t0, 16
+
+# RUN: ld.lld %t/a.o --section-start=.text=0x20010 --section-start=.sec.foo=0x40020 -o %t/exe2
+# RUN: llvm-objdump --no-show-raw-insn -d %t/exe2 | FileCheck --match-full-lines %s --check-prefix=EXE2
+## hi20 = target - pc + (1 << 17) >> 18 = 0x40020 - 0x20010 + 0x20000 >> 18 = 1
+## lo18 = target - pc & (1 << 18) - 1 = 0x40020 - 0x20010 & 0x3ffff = -131056
+# EXE2: 20010: pcaddu18i $t0, 1
+# EXE2-NEXT: 20014: jirl $zero, $t0, -131056
+
+# RUN: ld.lld %t/a.o -shared -T %t/a.t -o %t/a.so
+# RUN: llvm-readelf -x .got.plt %t/a.so | FileCheck --check-prefix=GOTPLT %s
+# RUN: llvm-objdump -d --no-show-raw-insn %t/a.so | FileCheck --check-prefix=SO %s
+## PLT should be present in this case.
+# SO: Disassembly of section .plt:
+# SO: <.plt>:
+## foo@plt:
+# SO: 1234520: pcaddu12i $t3, 64{{$}}
+# SO-NEXT: ld.d $t3, $t3, 544{{$}}
+# SO-NEXT: jirl $t1, $t3, 0
+# SO-NEXT: nop
+
+# SO: Disassembly of section .text:
+# SO: <_start>:
+## hi20 = foo@plt - pc + (1 << 17) >> 18 = 0x1234520 - 0x1274670 + 0x20000 >> 18 = -1
+## lo18 = foo@plt - pc & (1 << 18) - 1 = 0x1234520 - 0x1274670 & 0x3ffff = -336
+# SO-NEXT: pcaddu18i $t0, -1{{$}}
+# SO-NEXT: jirl $zero, $t0, -336{{$}}
+
+# GOTPLT: section '.got.plt':
+# GOTPLT-NEXT: 0x01274730 00000000 00000000 00000000 00000000
+# GOTPLT-NEXT: 0x01274740 00452301 00000000
+
+# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x2000020000 -o /dev/null 2>&1 | \
+# RUN: FileCheck -DFILE=%t/a.o --check-prefix=ERROR-RANGE %s
+# ERROR-RANGE: error: [[FILE]]:(.text+0x0): relocation R_LARCH_CALL36 out of range: 137438953472 is not in [-137439084544, 137438822399]; references 'foo'
+
+## Impossible case in reality because all LoongArch instructions are fixed 4-bytes long.
+# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x40001 -o /dev/null 2>&1 | \
+# RUN: FileCheck -DFILE=%t/a.o --check-prefix=ERROR-ALIGN %s
+# ERROR-ALIGN: error: [[FILE]]:(.text+0x0): improper alignment for relocation R_LARCH_CALL36: 0x20001 is not aligned to 4 bytes
+
+#--- a.t
+SECTIONS {
+ .plt 0x1234500: { *(.plt) }
+ .text 0x1274670: { *(.text) }
+}
+
+#--- a.s
+.text
+.global _start
+_start:
+ .reloc ., R_LARCH_CALL36, foo
+ pcaddu18i $t0, 0
+ jirl $zero, $t0, 0
+
+.section .sec.foo,"ax"
+.global foo
+foo:
+ ret
From b951239932c735deec633bb53c0efa71912155f1 Mon Sep 17 00:00:00 2001
From: vient <lozko.roma@gmail.com>
Date: Wed, 6 Sep 2023 03:14:32 +0300
Subject: [PATCH 251/342] [polly] [CMake] Link polly-isl-test with LLVMSupport
 (#65424)

Otherwise the link may fail if the user provided an additional library to
link with via CMAKE_EXE_LINKER_FLAGS. A concrete example is using a custom
allocator; LLVMSupport provides the needed -lpthread in that case.

Closes: https://github.com/llvm/llvm-project/pull/65424
---
 polly/lib/External/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/polly/lib/External/CMakeLists.txt b/polly/lib/External/CMakeLists.txt
index 458925f9b6e4d..1869410c8baa5 100644
--- a/polly/lib/External/CMakeLists.txt
+++ b/polly/lib/External/CMakeLists.txt
@@ -306,6 +306,7 @@ if (POLLY_BUNDLED_ISL)
 
   target_link_libraries(polly-isl-test PRIVATE
     PollyISL
+    LLVMSupport
   )
 
   # ISL requires at least C99 to compile. gcc < 5.0 use -std=gnu89 as default.
From 58689e4318be7c42b54a77adf5439f6065caa58a Mon Sep 17 00:00:00 2001
From: Violet Purcell <vimproved@inventati.org>
Date: Sun, 17 Sep 2023 11:10:53 -0400
Subject: [PATCH 252/342] [polly] [CMake] Create component and install target
 in add_polly_library (#66598)

Currently there's no component for LLVMPolly and PollyISL; however, they
are added to exports whether or not they are installed. This commit calls
add_llvm_install_targets in the add_polly_library function to allow
installation of LLVMPolly and PollyISL via distribution components, so
they can be installed without also installing libPolly.a.
Closes: https://github.com/llvm/llvm-project/pull/66598
---
 polly/cmake/polly_macros.cmake | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/polly/cmake/polly_macros.cmake b/polly/cmake/polly_macros.cmake
index 518a09b45a420..df541eeccc4cb 100644
--- a/polly/cmake/polly_macros.cmake
+++ b/polly/cmake/polly_macros.cmake
@@ -43,9 +43,12 @@ macro(add_polly_library name)
   endif( LLVM_LINK_COMPONENTS )
   if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR ${name} STREQUAL "LLVMPolly")
     install(TARGETS ${name}
+      COMPONENT ${name}
       EXPORT LLVMExports
       LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
       ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX})
+    add_llvm_install_targets(install-${name}
+                             COMPONENT ${name})
   endif()
   set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS ${name})
 endmacro(add_polly_library)
From 0fbc728dba97149e530cfb7f2ada0283c398a7ce Mon Sep 17 00:00:00 2001
From: Weining Lu <luweining@loongson.cn>
Date: Mon, 25 Dec 2023 18:28:19 +0800
Subject: [PATCH 253/342] [lld][test][LoongArch] Remove the test for
 R_LARCH_CALL36 range checking

Several buildbots report:

ld.lld: error: failed to open /dev/null: Cannot allocate memory

For example:

- https://lab.llvm.org/buildbot/#/builders/184/builds/8842
- https://lab.llvm.org/buildbot/#/builders/247/builds/12559
---
 lld/test/ELF/loongarch-call36.s | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/lld/test/ELF/loongarch-call36.s b/lld/test/ELF/loongarch-call36.s
index 2d25a2ac64ed7..0a00adacbd6a5 100644
--- a/lld/test/ELF/loongarch-call36.s
+++ b/lld/test/ELF/loongarch-call36.s
@@ -40,10 +40,6 @@
 # GOTPLT-NEXT: 0x01274730 00000000 00000000 00000000 00000000
 # GOTPLT-NEXT: 0x01274740 00452301 00000000
 
-# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x2000020000 -o /dev/null 2>&1 | \
-# RUN: FileCheck -DFILE=%t/a.o --check-prefix=ERROR-RANGE %s
-# ERROR-RANGE: error: [[FILE]]:(.text+0x0): relocation R_LARCH_CALL36 out of range: 137438953472 is not in [-137439084544, 137438822399]; references 'foo'
-
 ## Impossible case in reality because all LoongArch instructions are fixed 4-bytes long.
 # RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x40001 -o /dev/null 2>&1 | \
 # RUN: FileCheck -DFILE=%t/a.o --check-prefix=ERROR-ALIGN %s
From 48f36c6e742e743e33f931536c653bf4e23568fb Mon Sep 17 00:00:00 2001
From: Acim Maravic <Acim.Maravic@Syrmia.com>
Date: Mon, 25 Dec 2023 11:55:20 +0100
Subject: [PATCH 254/342] [LLVM] Make use of s_flbit_i32_b64 and s_ff1_i32_b64
 (#75158)

Update DAG ISel to support 64-bit versions S_FF1_I32_B64 and
S_FLBIT_I32_B64

---------

Co-authored-by: Acim Maravic <Acim.Maravic@amd.com>
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  16 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  64 ++
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |   3 +
 .../AMDGPU/atomic_optimizations_buffer.ll     | 105 +--
 .../atomic_optimizations_global_pointer.ll    |  94 +-
 .../atomic_optimizations_local_pointer.ll     | 291 ++-----
 .../AMDGPU/atomic_optimizations_raw_buffer.ll |  70 +-
 .../atomic_optimizations_struct_buffer.ll     |  70 +-
 llvm/test/CodeGen/AMDGPU/ctlz.ll              |  76 +-
 llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll   | 803 +++++++++++++++++-
 llvm/test/CodeGen/AMDGPU/cttz.ll              |  52 +-
 llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll   |  53 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        | 105 +--
 .../AMDGPU/global_atomics_scan_fmax.ll        |  51 +-
 .../AMDGPU/global_atomics_scan_fmin.ll        |  51 +-
 .../AMDGPU/global_atomics_scan_fsub.ll        | 105 +--
 llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll  |  20 +-
 .../CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll  | 332 ++++++++
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            |  25 +-
 llvm/test/CodeGen/AMDGPU/srem64.ll            |  45 +-
 llvm/test/CodeGen/AMDGPU/udiv64.ll            |  40 +-
 llvm/test/CodeGen/AMDGPU/urem64.ll            |  30 +-
 22 files changed, 1567 insertions(+), 934 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 4bf4707553e5f..541a5b62450dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3070,18 +3070,26 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
 
   bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
                    Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
+  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
 
-  if (Src.getValueType() == MVT::i32) {
+  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
     // (ctlz hi:lo) -> (umin (ffbh src), 32)
     // (cttz hi:lo) -> (umin (ffbl src), 32)
     // (ctlz_zero_undef src) -> (ffbh src)
     // (cttz_zero_undef src) -> (ffbl src)
+
+    // 64-bit scalar versions produce a 32-bit result
+    // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
+    // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
+    // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
+    // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
     SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
     if (!ZeroUndef) {
-      const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
-      NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
+      const SDValue ConstVal = DAG.getConstant(
+          Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
+      NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
     }
-    return NewOpr;
+    return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
   }
 
   SDValue Lo, Hi;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 29ac08b6895e5..ebe23a5eac57b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6912,6 +6912,15 @@ void
SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, Inst.eraseFromParent(); return; + case AMDGPU::S_FLBIT_I32_B64: + splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32); + Inst.eraseFromParent(); + return; + case AMDGPU::S_FF1_I32_B64: + splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32); + Inst.eraseFromParent(); + return; + case AMDGPU::S_LSHL_B32: if (ST.hasOnlyRevVALUShifts()) { NewOpcode = AMDGPU::V_LSHLREV_B32_e64; @@ -7845,6 +7854,61 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } +void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist, + MachineInstr &Inst, unsigned Opcode, + MachineDominatorTree *MDT) const { + // (S_FLBIT_I32_B64 hi:lo) -> + // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32)) + // (S_FF1_I32_B64 hi:lo) -> + // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo)) + + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + const DebugLoc &DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src = Inst.getOperand(1); + + const MCInstrDesc &InstDesc = get(Opcode); + + bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32; + unsigned OpcodeAdd = + ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32; + + const TargetRegisterClass *SrcRC = + Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass; + const TargetRegisterClass *SrcSubRC = + RI.getSubRegisterClass(SrcRC, AMDGPU::sub0); + + MachineOperand SrcRegSub0 = + buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC); + MachineOperand SrcRegSub1 = + buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC); + + Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0); + + BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1); + + BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3) + .addReg(IsCtlz ? MidReg1 : MidReg2) + .addImm(32) + .addImm(1); // enable clamp + + BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4) + .addReg(MidReg3) + .addReg(IsCtlz ? 
MidReg2 : MidReg1); + + MRI.replaceRegWith(Dest.getReg(), MidReg4); + + addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist); +} + void SIInstrInfo::addUsersToMoveToVALUWorklist( Register DstReg, MachineRegisterInfo &MRI, SIInstrWorklist &Worklist) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index affe520467520..46eee6fae0a52 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -144,6 +144,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void splitScalar64BitBCNT(SIInstrWorklist &Worklist, MachineInstr &Inst) const; void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const; + void splitScalar64BitCountOp(SIInstrWorklist &Worklist, MachineInstr &Inst, + unsigned Opcode, + MachineDominatorTree *MDT = nullptr) const; void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index db33ed8fa5566..e3d2ecefbda30 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -589,13 +589,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -633,13 +630,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -676,10 +670,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 -; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 -; GFX10W64-NEXT: s_add_i32 s5, s5, 32 -; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 @@ -758,16 +749,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX11W64-NEXT: s_add_i32 s5, s5, 32 -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -849,16 +836,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: s_min_u32 s5, s6, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -961,13 +944,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -1007,13 +987,10 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -1052,10 +1029,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 -; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 -; GFX10W64-NEXT: s_add_i32 s5, s5, 32 -; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 @@ -1140,16 +1114,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; 
GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX11W64-NEXT: s_add_i32 s5, s5, 32 -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 @@ -1237,16 +1207,12 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: s_min_u32 s5, s6, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB3_1 @@ -2005,13 +1971,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -2049,13 +2012,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -2092,10 +2052,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 -; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 -; GFX10W64-NEXT: s_add_i32 s5, s5, 32 -; 
GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 @@ -2174,16 +2131,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX11W64-NEXT: s_add_i32 s5, s5, 32 -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2266,16 +2219,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: s_min_u32 s5, s6, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 6a664f26d470b..9f97f1f4bace5 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -657,15 +657,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s7, s5, s4 -; GFX8-NEXT: v_readlane_b32 s8, v0, s7 -; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX8-NEXT: s_mov_b32 m0, s7 +; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s4 +; GFX8-NEXT: v_readlane_b32 s7, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8-NEXT: s_add_i32 s6, s6, s8 +; GFX8-NEXT: s_add_i32 s6, s6, s7 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 @@ -705,15 +702,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop 
Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s7, s5, s4 -; GFX9-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX9-NEXT: s_mov_b32 m0, s7 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s4 +; GFX9-NEXT: v_readlane_b32 s7, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9-NEXT: s_add_i32 s6, s6, s8 +; GFX9-NEXT: s_add_i32 s6, s6, s7 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 @@ -753,10 +747,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: s_min_u32 s7, s5, s4 +; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 @@ -847,16 +838,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s7, s5, s4 +; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_add_i32 s6, s6, s8 ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 @@ -951,16 +938,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1264-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1264-NEXT: s_add_co_i32 s4, s4, 32 -; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1264-NEXT: s_min_u32 s7, s5, s4 +; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1264-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1264-NEXT: v_writelane_b32 v1, s6, s7 ; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264-NEXT: s_add_co_i32 s6, s6, s8 ; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1264-NEXT: s_cbranch_scc1 .LBB2_1 @@ -2557,15 +2540,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB8_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s3 -; GFX8-NEXT: s_ff1_i32_b32 s5, s2 -; GFX8-NEXT: s_add_i32 s4, s4, 32 -; GFX8-NEXT: s_min_u32 s7, s5, s4 -; GFX8-NEXT: v_readlane_b32 s8, v0, s7 -; GFX8-NEXT: s_lshl_b64 
s[4:5], 1, s7 -; GFX8-NEXT: s_mov_b32 m0, s7 +; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s4 +; GFX8-NEXT: v_readlane_b32 s7, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX8-NEXT: v_writelane_b32 v1, s6, m0 -; GFX8-NEXT: s_add_i32 s6, s6, s8 +; GFX8-NEXT: s_add_i32 s6, s6, s7 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 @@ -2605,15 +2585,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB8_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s3 -; GFX9-NEXT: s_ff1_i32_b32 s5, s2 -; GFX9-NEXT: s_add_i32 s4, s4, 32 -; GFX9-NEXT: s_min_u32 s7, s5, s4 -; GFX9-NEXT: v_readlane_b32 s8, v0, s7 -; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s7 -; GFX9-NEXT: s_mov_b32 m0, s7 +; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s4 +; GFX9-NEXT: v_readlane_b32 s7, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 ; GFX9-NEXT: v_writelane_b32 v1, s6, m0 -; GFX9-NEXT: s_add_i32 s6, s6, s8 +; GFX9-NEXT: s_add_i32 s6, s6, s7 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 @@ -2653,10 +2630,7 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 -; GFX1064-NEXT: s_add_i32 s4, s4, 32 -; GFX1064-NEXT: s_min_u32 s7, s5, s4 +; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1064-NEXT: v_writelane_b32 v1, s6, s7 @@ -2747,16 +2721,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1164-NEXT: s_add_i32 s4, s4, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s7, s5, s4 +; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1164-NEXT: v_writelane_b32 v1, s6, s7 ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_add_i32 s6, s6, s8 ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 @@ -2851,16 +2821,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: ; implicit-def: $vgpr1 ; GFX1264-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-NEXT: s_ctz_i32_b32 s4, s3 -; GFX1264-NEXT: s_ctz_i32_b32 s5, s2 -; GFX1264-NEXT: s_add_co_i32 s4, s4, 32 -; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1264-NEXT: s_min_u32 s7, s5, s4 +; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3] +; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1264-NEXT: v_readlane_b32 s8, v0, s7 ; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7 ; GFX1264-NEXT: v_writelane_b32 v1, s6, s7 ; GFX1264-NEXT: 
s_and_not1_b64 s[2:3], s[2:3], s[4:5] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264-NEXT: s_add_co_i32 s6, s6, s8 ; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1264-NEXT: s_cbranch_scc1 .LBB8_1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index cf2afeb7b01bc..34499043ce6bb 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -478,13 +478,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -522,13 +519,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -565,10 +559,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 -; GFX1064-NEXT: s_add_i32 s5, s5, 32 -; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 @@ -649,16 +640,12 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 -; GFX1164-NEXT: s_add_i32 s5, s5, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_add_i32 s4, s4, s8 ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 @@ -757,10 +744,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s3, s1 -; GFX8-NEXT: 
s_ff1_i32_b32 s4, s0 -; GFX8-NEXT: s_add_i32 s3, s3, 32 -; GFX8-NEXT: s_min_u32 s3, s4, s3 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 ; GFX8-NEXT: s_add_i32 s2, s2, s6 @@ -789,10 +773,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s0 -; GFX9-NEXT: s_add_i32 s3, s3, 32 -; GFX9-NEXT: s_min_u32 s3, s4, s3 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 ; GFX9-NEXT: s_add_i32 s2, s2, s6 @@ -820,10 +801,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1064-NEXT: s_mov_b32 s2, 0 ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s0 -; GFX1064-NEXT: s_add_i32 s3, s3, 32 -; GFX1064-NEXT: s_min_u32 s3, s4, s3 +; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -880,15 +858,12 @@ define amdgpu_kernel void @add_i32_varying_nouse() { ; GFX1164-NEXT: s_mov_b32 s2, 0 ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s4, s0 -; GFX1164-NEXT: s_add_i32 s3, s3, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s3, s4, s3 +; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: s_add_i32 s2, s2, s6 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 @@ -2005,13 +1980,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB9_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -2049,13 +2021,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB9_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -2092,10 +2061,7 @@ define 
amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 -; GFX1064-NEXT: s_add_i32 s5, s5, 32 -; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 @@ -2176,16 +2142,12 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 -; GFX1164-NEXT: s_add_i32 s5, s5, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_add_i32 s4, s4, s8 ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 @@ -2284,10 +2246,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX8-NEXT: s_mov_b32 s2, 0 ; GFX8-NEXT: .LBB10_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s3, s1 -; GFX8-NEXT: s_ff1_i32_b32 s4, s0 -; GFX8-NEXT: s_add_i32 s3, s3, 32 -; GFX8-NEXT: s_min_u32 s3, s4, s3 +; GFX8-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX8-NEXT: v_readlane_b32 s6, v0, s3 ; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s3 ; GFX8-NEXT: s_add_i32 s2, s2, s6 @@ -2316,10 +2275,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX9-NEXT: s_mov_b32 s2, 0 ; GFX9-NEXT: .LBB10_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s1 -; GFX9-NEXT: s_ff1_i32_b32 s4, s0 -; GFX9-NEXT: s_add_i32 s3, s3, 32 -; GFX9-NEXT: s_min_u32 s3, s4, s3 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX9-NEXT: v_readlane_b32 s6, v0, s3 ; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s3 ; GFX9-NEXT: s_add_i32 s2, s2, s6 @@ -2347,10 +2303,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1064-NEXT: s_mov_b32 s2, 0 ; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s4, s0 -; GFX1064-NEXT: s_add_i32 s3, s3, 32 -; GFX1064-NEXT: s_min_u32 s3, s4, s3 +; GFX1064-NEXT: s_ff1_i32_b64 s3, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s6, v0, s3 ; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s3 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] @@ -2407,15 +2360,12 @@ define amdgpu_kernel void @sub_i32_varying_nouse() { ; GFX1164-NEXT: s_mov_b32 s2, 0 ; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s4, s0 -; GFX1164-NEXT: s_add_i32 s3, s3, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s3, s4, s3 +; GFX1164-NEXT: s_ctz_i32_b64 s3, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: 
v_readlane_b32 s6, v0, s3 ; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s3 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: s_add_i32 s2, s2, s6 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 @@ -3105,13 +3055,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB14_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_and_b32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -3149,13 +3096,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB14_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_and_b32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -3192,10 +3136,7 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 -; GFX1064-NEXT: s_add_i32 s5, s5, 32 -; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 @@ -3276,16 +3217,12 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 -; GFX1164-NEXT: s_add_i32 s5, s5, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_and_b32 s4, s4, s8 ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 @@ -3392,13 +3329,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB15_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: 
s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -3436,13 +3370,10 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB15_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_or_b32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -3479,10 +3410,7 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 -; GFX1064-NEXT: s_add_i32 s5, s5, 32 -; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 @@ -3563,16 +3491,12 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 -; GFX1164-NEXT: s_add_i32 s5, s5, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_or_b32 s4, s4, s8 ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 @@ -3679,13 +3603,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB16_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_xor_b32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -3723,13 +3644,10 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB16_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; 
GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_xor_b32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -3766,10 +3684,7 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 -; GFX1064-NEXT: s_add_i32 s5, s5, 32 -; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 @@ -3850,16 +3765,12 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 -; GFX1164-NEXT: s_add_i32 s5, s5, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_xor_b32 s4, s4, s8 ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1 @@ -3966,13 +3877,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB17_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_max_i32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -4010,13 +3918,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB17_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_max_i32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -4053,10 +3958,7 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 -; GFX1064-NEXT: s_add_i32 s5, s5, 32 -; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 @@ -4137,16 +4039,12 
@@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 -; GFX1164-NEXT: s_add_i32 s5, s5, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_max_i32 s4, s4, s8 ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 @@ -4495,13 +4393,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB19_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_min_i32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -4539,13 +4434,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB19_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_min_i32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -4582,10 +4474,7 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 -; GFX1064-NEXT: s_add_i32 s5, s5, 32 -; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 @@ -4666,16 +4555,12 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 -; GFX1164-NEXT: s_add_i32 s5, s5, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], 
s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_min_i32 s4, s4, s8 ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1 @@ -5024,13 +4909,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB21_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_max_u32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -5068,13 +4950,10 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB21_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_max_u32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -5111,10 +4990,7 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 -; GFX1064-NEXT: s_add_i32 s5, s5, 32 -; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 @@ -5195,16 +5071,12 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 -; GFX1164-NEXT: s_add_i32 s5, s5, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_max_u32 s4, s4, s8 ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1 @@ -5548,13 +5420,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB23_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; 
GFX8-NEXT: s_min_u32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -5592,13 +5461,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB23_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_min_u32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -5635,10 +5501,7 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s5, s3 -; GFX1064-NEXT: s_ff1_i32_b32 s6, s2 -; GFX1064-NEXT: s_add_i32 s5, s5, 32 -; GFX1064-NEXT: s_min_u32 s5, s6, s5 +; GFX1064-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX1064-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1064-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1064-NEXT: v_writelane_b32 v1, s4, s5 @@ -5719,16 +5582,12 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_ctz_i32_b32 s5, s3 -; GFX1164-NEXT: s_ctz_i32_b32 s6, s2 -; GFX1164-NEXT: s_add_i32 s5, s5, 32 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_min_u32 s5, s6, s5 +; GFX1164-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1164-NEXT: v_readlane_b32 s8, v0, s5 ; GFX1164-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX1164-NEXT: v_writelane_b32 v1, s4, s5 ; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-NEXT: s_min_u32 s4, s4, s8 ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 43068a28812ed..79f8b3a1d5d84 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -588,13 +588,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -632,13 +629,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: 
s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -675,10 +669,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 -; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 -; GFX10W64-NEXT: s_add_i32 s5, s5, 32 -; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 @@ -757,16 +748,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX11W64-NEXT: s_add_i32 s5, s5, 32 -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -848,16 +835,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: s_min_u32 s5, s6, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -1610,13 +1593,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB6_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -1654,13 +1634,10 @@ define amdgpu_kernel 
void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -1697,10 +1674,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 -; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 -; GFX10W64-NEXT: s_add_i32 s5, s5, 32 -; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 @@ -1779,16 +1753,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX11W64-NEXT: s_add_i32 s5, s5, 32 -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 @@ -1871,16 +1841,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: s_min_u32 s5, s6, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB6_1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 5ac8ed8df456d..edf6fbadf1a60 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -605,13 +605,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) 
%out, ptr addr ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -650,13 +647,10 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -694,10 +688,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 -; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 -; GFX10W64-NEXT: s_add_i32 s5, s5, 32 -; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 @@ -778,16 +769,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX11W64-NEXT: s_add_i32 s5, s5, 32 -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -871,16 +858,12 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: s_min_u32 s5, s6, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX12W64-NEXT: 
s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB2_1 @@ -1785,13 +1768,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX8-NEXT: s_ff1_i32_b32 s5, s3 -; GFX8-NEXT: s_ff1_i32_b32 s6, s2 -; GFX8-NEXT: s_add_i32 s5, s5, 32 -; GFX8-NEXT: s_min_u32 s5, s6, s5 +; GFX8-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_readlane_b32 s8, v0, s5 ; GFX8-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX8-NEXT: s_mov_b32 m0, s5 ; GFX8-NEXT: v_writelane_b32 v1, s4, m0 ; GFX8-NEXT: s_add_i32 s4, s4, s8 ; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -1830,13 +1810,10 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s5, s3 -; GFX9-NEXT: s_ff1_i32_b32 s6, s2 -; GFX9-NEXT: s_add_i32 s5, s5, 32 -; GFX9-NEXT: s_min_u32 s5, s6, s5 +; GFX9-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_readlane_b32 s8, v0, s5 ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s5 -; GFX9-NEXT: s_mov_b32 m0, s5 ; GFX9-NEXT: v_writelane_b32 v1, s4, m0 ; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] @@ -1874,10 +1851,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10W64-NEXT: s_ff1_i32_b32 s5, s3 -; GFX10W64-NEXT: s_ff1_i32_b32 s6, s2 -; GFX10W64-NEXT: s_add_i32 s5, s5, 32 -; GFX10W64-NEXT: s_min_u32 s5, s6, s5 +; GFX10W64-NEXT: s_ff1_i32_b64 s5, s[2:3] ; GFX10W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX10W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX10W64-NEXT: v_writelane_b32 v1, s4, s5 @@ -1958,16 +1932,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX11W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX11W64-NEXT: s_add_i32 s5, s5, 32 -; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11W64-NEXT: s_min_u32 s5, s6, s5 +; GFX11W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX11W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX11W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX11W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11W64-NEXT: s_add_i32 s4, s4, s8 ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 @@ -2052,16 +2022,12 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: ; implicit-def: $vgpr1 ; GFX12W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX12W64-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12W64-NEXT: s_ctz_i32_b32 s5, s3 -; GFX12W64-NEXT: s_ctz_i32_b32 s6, s2 -; GFX12W64-NEXT: s_add_co_i32 s5, s5, 32 -; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12W64-NEXT: 
s_min_u32 s5, s6, s5 +; GFX12W64-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX12W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12W64-NEXT: v_readlane_b32 s8, v0, s5 ; GFX12W64-NEXT: s_lshl_b64 s[6:7], 1, s5 ; GFX12W64-NEXT: v_writelane_b32 v1, s4, s5 ; GFX12W64-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[6:7] -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: s_add_co_i32 s4, s4, s8 ; GFX12W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX12W64-NEXT: s_cbranch_scc1 .LBB7_1 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 213b6e6e620d3..3d69655111da6 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -607,13 +607,10 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 -; SI-NEXT: s_flbit_i32_b32 s5, s5 -; SI-NEXT: s_min_u32 s4, s4, 0xffffffdf -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: s_add_i32 s4, s4, 32 -; SI-NEXT: v_min3_u32 v0, s4, v0, 64 +; SI-NEXT: s_flbit_i32_b64 s4, s[4:5] +; SI-NEXT: s_min_u32 s4, s4, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -625,10 +622,9 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s4, s4 -; VI-NEXT: v_add_u32_e64 v0, s[6:7], s4, 32 clamp -; VI-NEXT: s_flbit_i32_b32 s4, s5 -; VI-NEXT: v_min3_u32 v0, v0, s4, 64 +; VI-NEXT: s_flbit_i32_b64 s4, s[4:5] +; VI-NEXT: s_min_u32 s4, s4, 64 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -657,10 +653,9 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b32 s0, s2 -; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp -; GFX10-NEXT: s_flbit_i32_b32 s0, s3 -; GFX10-NEXT: v_min3_u32 v0, v0, s0, 64 +; GFX10-NEXT: s_flbit_i32_b64 s0, s[2:3] +; GFX10-NEXT: s_min_u32 s0, s0, 64 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -684,14 +679,11 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u32 s2, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp -; GFX11-NEXT: s_clz_i32_u32 s2, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64 +; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s2, s2, 64 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -706,16 +698,13 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 
s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] +; SI-NEXT: s_min_u32 s2, s2, 64 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_flbit_i32_b32 s0, s2 -; SI-NEXT: s_min_u32 s0, s0, 0xffffffdf -; SI-NEXT: s_flbit_i32_b32 s1, s3 -; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: v_mov_b32_e32 v0, s1 -; SI-NEXT: v_min3_u32 v0, s0, v0, 64 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -726,11 +715,10 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_flbit_i32_b32 s0, s2 +; VI-NEXT: s_flbit_i32_b64 s0, s[2:3] +; VI-NEXT: s_min_u32 s0, s0, 64 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_add_u32_e64 v0, s[0:1], s0, 32 clamp -; VI-NEXT: s_flbit_i32_b32 s0, s3 -; VI-NEXT: v_min3_u32 v0, v0, s0, 64 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -754,13 +742,12 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; GFX10-LABEL: s_ctlz_i64_trunc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b32 s2, s2 -; GFX10-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp -; GFX10-NEXT: s_flbit_i32_b32 s2, s3 -; GFX10-NEXT: v_min3_u32 v0, v0, s2, 64 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_flbit_i32_b64 s2, s[2:3] +; GFX10-NEXT: s_min_u32 s2, s2, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: @@ -777,15 +764,12 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; GFX11-LABEL: s_ctlz_i64_trunc: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u32 s2, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_add_nc_u32_e64 v0, s2, 32 clamp -; GFX11-NEXT: s_clz_i32_u32 s2, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_min3_u32 v0, v0, s2, 64 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_min_u32 s2, s2, 64 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 354f5b954659a..03f3d04cf8a68 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -6,6 +6,8 @@ declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone +declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone + declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone @@ -305,6 +307,787 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ret void } +define amdgpu_kernel void 
@s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind { +; SI-LABEL: s_ctlz_zero_undef_i8_with_select: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s2, s2, 0xff +; SI-NEXT: s_flbit_i32_b32 s2, s2 +; SI-NEXT: s_sub_i32 s4, s2, 24 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_ctlz_zero_undef_i8_with_select: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: s_add_i32 s2, s2, -16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_add_u16_e64 v2, s2, -8 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_byte v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_ctlz_zero_undef_i8_with_select: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: FFBH_UINT T0.W, T0.X, +; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, +; EG-NEXT: -24(nan), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: LSHL * T1.W, T1.W, literal.y, +; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xff +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_endpgm + %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone + %ctlz_ret = icmp ne i8 %val, 0 + %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32 + store i8 %ctlz, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind { +; SI-LABEL: s_ctlz_zero_undef_i16_with_select: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s2, s2, 0xffff +; SI-NEXT: s_flbit_i32_b32 s2, s2 +; SI-NEXT: s_add_i32 s4, s2, -16 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_ctlz_zero_undef_i16_with_select: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_and_b32 s2, 
s2, 0xffff +; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: s_add_i32 s2, s2, -16 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_ctlz_zero_undef_i16_with_select: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: FFBH_UINT T0.W, T0.X, +; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, +; EG-NEXT: -16(nan), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: LSHL * T1.W, T1.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xffff +; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 +; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 16 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_endpgm + %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone + %ctlz_ret = icmp ne i16 %val, 0 + %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32 + store i16 %ctlz, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind { +; SI-LABEL: s_ctlz_zero_undef_i32_with_select: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s2, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_flbit_i32_b32 s4, s2 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_ctlz_zero_undef_i32_with_select: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_flbit_i32_b32 s2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_ctlz_zero_undef_i32_with_select: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: FFBH_UINT * T1.X, KC0[2].Z, +; +; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_endpgm + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone + %ctlz_ret = icmp ne i32 %val, 0 + %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32 + store i32 %ctlz, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind { +; SI-LABEL: s_ctlz_zero_undef_i64_with_select: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: s_ctlz_zero_undef_i64_with_select: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; EG-LABEL: s_ctlz_zero_undef_i64_with_select: +; EG: ; %bb.0: +; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: FFBH_UINT * T0.W, KC0[2].W, +; EG-NEXT: FFBH_UINT T1.W, KC0[3].X, +; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T0.X, KC0[3].X, PS, PV.W, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: s_endpgm + %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone + %ctlz_ret = icmp ne i64 %val, 0 + %ret = select i1 %ctlz_ret, i64 %ctlz, i64 32 + store i64 %ctlz, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: v_ctlz_zero_undef_i8_with_select: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_ffbh_u32_e32 v1, v0 +; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc +; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ctlz_zero_undef_i8_with_select: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], 
s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1 +; VI-NEXT: v_add_u16_e32 v1, -8, v1 +; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_byte v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: v_ctlz_zero_undef_i8_with_select: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: FFBH_UINT * T0.W, T0.X, +; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: -24(nan), 3(4.203895e-45) +; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: LSHL * T1.W, T1.W, literal.y, +; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 24, v2 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm + %val = load i8, ptr addrspace(1) %arrayidx, align 1 + %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone + %ctlz_ret = icmp ne i8 %val, 0 + %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32 + store i8 %ret, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: v_ctlz_zero_undef_i16_with_select: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_ffbh_u32_e32 v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc +; SI-NEXT: buffer_store_short v0, off, 
s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ctlz_zero_undef_i16_with_select: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s4, s2, 1 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_load_ubyte v2, v[2:3] +; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1 +; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: v_ctlz_zero_undef_i16_with_select: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: FFBH_UINT * T0.W, T0.X, +; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: -16(nan), 3(4.203895e-45) +; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: LSHL * T1.W, T1.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm + %val = load i16, ptr addrspace(1) %arrayidx, align 1 + %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone + %ctlz_ret = icmp ne i16 %val, 0 + %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32 + store i16 %ret, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: v_ctlz_zero_undef_i32_with_select: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: 
buffer_load_ubyte v0, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:2 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_ffbh_u32_e32 v0, v0 +; SI-NEXT: v_min_u32_e32 v0, 32, v0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ctlz_zero_undef_i32_with_select: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s2, s2, 1 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: flat_load_ubyte v2, v[2:3] +; VI-NEXT: flat_load_ubyte v3, v[4:5] +; VI-NEXT: flat_load_ubyte v4, v[6:7] +; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v0, v2, v0 +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: v_min_u32_e32 v2, 32, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; EG-LABEL: v_ctlz_zero_undef_i32_with_select: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 6, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: LSHL * T0.W, T1.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, +; EG-NEXT: FFBH_UINT * T1.W, PV.W, +; EG-NEXT: CNDE_INT T0.X, T0.W, literal.x, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 32(4.484155e-44), 2(2.802597e-45) +; +; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; 
GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm + %val = load i32, ptr addrspace(1) %arrayidx, align 1 + %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone + %ctlz_ret = icmp ne i32 %val, 0 + %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32 + store i32 %ret, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; SI-LABEL: v_ctlz_zero_undef_i64_with_select: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:5 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:7 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2 +; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4 +; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v2 +; SI-NEXT: v_ffbh_u32_e32 v1, v1 +; SI-NEXT: v_ffbh_u32_e32 v0, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1 +; SI-NEXT: v_min_u32_e32 v0, v1, v0 +; SI-NEXT: v_min_u32_e32 v0, 64, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; VI-LABEL: v_ctlz_zero_undef_i64_with_select: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s4, s2, 5 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_add_u32 s4, s2, 4 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_add_u32 s4, s2, 7 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: s_add_u32 s4, s2, 6 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v7, s5 +; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: s_add_u32 s4, s2, 3 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: v_mov_b32_e32 v8, s4 +; VI-NEXT: s_add_u32 s4, s2, 2 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v11, s5 +; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: s_add_u32 s4, s2, 1 +; VI-NEXT: 
flat_load_ubyte v12, v[0:1] +; VI-NEXT: flat_load_ubyte v13, v[2:3] +; VI-NEXT: flat_load_ubyte v4, v[4:5] +; VI-NEXT: flat_load_ubyte v5, v[6:7] +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: flat_load_ubyte v6, v[8:9] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: flat_load_ubyte v7, v[10:11] +; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: flat_load_ubyte v2, v[2:3] +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_e32 v3, v3, v13 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v3, v4, v3 +; VI-NEXT: v_ffbh_u32_e32 v3, v3 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v0, v4, v0 +; VI-NEXT: v_ffbh_u32_e32 v0, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; VI-NEXT: v_min_u32_e32 v0, v0, v3 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_min_u32_e32 v0, 64, v0 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; EG-LABEL: v_ctlz_zero_undef_i64_with_select: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 3 @6 +; EG-NEXT: ALU 15, @15, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1 +; EG-NEXT: VTX_READ_16 T2.X, T0.X, 4, #1 +; EG-NEXT: VTX_READ_16 T3.X, T0.X, 6, #1 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 15: +; EG-NEXT: LSHL * T0.W, T1.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, T0.X, +; EG-NEXT: FFBH_UINT T1.W, PV.W, +; EG-NEXT: LSHL * T2.W, T3.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T0.W, T0.W, literal.x, PV.W, +; EG-NEXT: OR_INT * T1.W, PS, T2.X, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: FFBH_UINT T2.W, PS, +; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: CNDE_INT T0.X, T1.W, PS, PV.W, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 +; 
GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4) +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 +; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) +; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v8, 24, v5 +; GFX9-GISEL-NEXT: v_or3_b32 v3, v0, v4, 0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v2 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v4, v3 +; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0 +; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v4, v0 +; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: s_endpgm + %val = load i64, ptr addrspace(1) %arrayidx, align 1 + %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone + %ctlz_ret = icmp ne i64 %val, 0 + %ret = select i1 %ctlz_ret, i64 %ctlz, i64 64 + store i64 %ret, ptr addrspace(1) %out, align 4 + ret void +} + define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind { ; SI-LABEL: v_ctlz_zero_undef_i8: ; SI: ; %bb.0: @@ -403,10 +1186,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s4, s4 -; SI-NEXT: s_flbit_i32_b32 s5, s5 -; SI-NEXT: s_add_i32 s4, s4, 32 -; SI-NEXT: s_min_u32 s4, s4, s5 +; SI-NEXT: s_flbit_i32_b64 s4, s[4:5] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -418,10 +1198,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s2 -; VI-NEXT: s_flbit_i32_b32 s3, s3 -; VI-NEXT: s_add_i32 s2, s2, 32 -; VI-NEXT: s_min_u32 s2, s2, s3 +; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -467,10 +1244,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_flbit_i32_b32 s2, s2 -; SI-NEXT: s_flbit_i32_b32 s3, s3 -; SI-NEXT: s_add_i32 s2, s2, 32 -; SI-NEXT: s_min_u32 s2, s2, s3 +; SI-NEXT: s_flbit_i32_b64 s2, s[2:3] ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 @@ -482,10 +1256,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b32 s2, s2 -; VI-NEXT: s_flbit_i32_b32 s3, s3 -; VI-NEXT: s_add_i32 s2, s2, 32 -; VI-NEXT: s_min_u32 s2, s2, s3 +; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index e871b80cbe29e..db91554b2ff39 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -510,13 
+510,10 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s5, s5 -; SI-NEXT: s_min_u32 s5, s5, 0xffffffdf -; SI-NEXT: s_add_i32 s5, s5, 32 -; SI-NEXT: s_ff1_i32_b32 s4, s4 -; SI-NEXT: v_mov_b32_e32 v0, s5 -; SI-NEXT: v_min3_u32 v0, s4, v0, 64 +; SI-NEXT: s_ff1_i32_b64 s4, s[4:5] +; SI-NEXT: s_min_u32 s4, s4, 64 ; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -528,10 +525,9 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s5, s5 -; VI-NEXT: v_add_u32_e64 v0, s[6:7], s5, 32 clamp -; VI-NEXT: s_ff1_i32_b32 s4, s4 -; VI-NEXT: v_min3_u32 v0, s4, v0, 64 +; VI-NEXT: s_ff1_i32_b64 s4, s[4:5] +; VI-NEXT: s_min_u32 s4, s4, 64 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; @@ -560,10 +556,9 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b32 s0, s3 -; GFX10-NEXT: v_add_nc_u32_e64 v0, s0, 32 clamp -; GFX10-NEXT: s_ff1_i32_b32 s0, s2 -; GFX10-NEXT: v_min3_u32 v0, s0, v0, 64 +; GFX10-NEXT: s_ff1_i32_b64 s0, s[2:3] +; GFX10-NEXT: s_min_u32 s0, s0, 64 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -591,16 +586,13 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_ff1_i32_b64 s2, s[2:3] +; SI-NEXT: s_min_u32 s2, s2, 64 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_ff1_i32_b32 s0, s3 -; SI-NEXT: s_min_u32 s0, s0, 0xffffffdf -; SI-NEXT: s_add_i32 s0, s0, 32 -; SI-NEXT: s_ff1_i32_b32 s1, s2 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_min3_u32 v0, s1, v0, 64 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -611,11 +603,10 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_ff1_i32_b32 s0, s3 +; VI-NEXT: s_ff1_i32_b64 s0, s[2:3] +; VI-NEXT: s_min_u32 s0, s0, 64 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_add_u32_e64 v0, s[0:1], s0, 32 clamp -; VI-NEXT: s_ff1_i32_b32 s0, s2 -; VI-NEXT: v_min3_u32 v0, s0, v0, 64 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -639,13 +630,12 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; GFX10-LABEL: s_cttz_i64_trunc: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b32 s3, s3 -; GFX10-NEXT: s_ff1_i32_b32 s2, s2 -; GFX10-NEXT: v_add_nc_u32_e64 v0, s3, 32 clamp -; GFX10-NEXT: v_min3_u32 v0, s2, v0, 64 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: s_ff1_i32_b64 s2, s[2:3] +; GFX10-NEXT: 
s_min_u32 s2, s2, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i64_trunc: diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 7e8c28fa44750..5985a235680c0 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -500,10 +500,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_ff1_i32_b32 s3, s3 -; SI-NEXT: s_ff1_i32_b32 s2, s2 -; SI-NEXT: s_add_i32 s3, s3, 32 -; SI-NEXT: s_min_u32 s2, s2, s3 +; SI-NEXT: s_ff1_i32_b64 s2, s[2:3] ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 @@ -516,10 +513,7 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b32 s3, s3 -; VI-NEXT: s_ff1_i32_b32 s2, s2 -; VI-NEXT: s_add_i32 s3, s3, 32 -; VI-NEXT: s_min_u32 s2, s2, s3 +; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -878,39 +872,41 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; SI-NEXT: s_mov_b32 s10, s2 ; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s0, s4 -; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_mov_b32 s8, s6 ; SI-NEXT: s_mov_b32 s9, s7 -; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 -; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 -; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 -; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:4 -; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:5 -; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:6 -; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:7 +; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:5 +; SI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:7 +; SI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 +; SI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 offset:2 +; SI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:4 +; SI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:6 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v6 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v2 -; SI-NEXT: v_or_b32_e32 v2, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v7, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v3, v5, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v2 ; SI-NEXT: v_ffbl_b32_e32 v1, v1 ; SI-NEXT: v_ffbl_b32_e32 v0, v0 -; SI-NEXT: v_min_u32_e32 v1, 
0xffffffdf, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v1 -; SI-NEXT: v_min3_u32 v0, v0, v1, 64 +; SI-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; SI-NEXT: v_min_u32_e32 v0, v0, v1 +; SI-NEXT: v_min_u32_e32 v0, 64, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -970,7 +966,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 ; VI-NEXT: v_ffbl_b32_e32 v3, v3 -; VI-NEXT: v_add_u32_e64 v3, s[2:3], v3, 32 clamp +; VI-NEXT: v_add_u32_e32 v3, vcc, 32, v3 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) @@ -979,8 +975,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: v_min3_u32 v0, v0, v3, 64 +; VI-NEXT: v_min_u32_e32 v0, v3, v0 ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_min_u32_e32 v0, 64, v0 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 064831f098667..5ebd3eef69f25 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -415,10 +415,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -484,10 +481,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -608,17 +602,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1610,10 +1601,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -1679,10 +1667,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -1803,17 +1788,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2849,10 +2831,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -2918,10 +2897,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: s_ff1_i32_b64 
s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -3042,17 +3018,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3586,10 +3559,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -3655,10 +3625,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -3779,17 +3746,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; 
GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -4780,10 +4744,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: .LBB8_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -4849,10 +4810,7 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -4973,17 +4931,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 66c89de1789ee..ce1654b38d4b8 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -434,10 +434,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mov_b32_e32 v2, 0xff800000 ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 @@ -507,11 +504,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: 
s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 @@ -639,12 +633,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 @@ -1622,10 +1613,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_mov_b32_e32 v2, 0xff800000 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 @@ -1695,11 +1683,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 @@ -1827,12 +1812,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 @@ -2810,10 +2792,7 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop ; GFX9-NEXT: v_mov_b32_e32 v2, 0xff800000 ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 
s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 @@ -2883,11 +2862,8 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 @@ -3015,12 +2991,9 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 17533e22ce2af..7379fd5a5422f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -434,10 +434,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 @@ -507,11 +504,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 @@ -639,12 +633,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1164-NEXT: s_min_u32 s2, s3, 
s2 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 @@ -1622,10 +1613,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 @@ -1695,11 +1683,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 @@ -1827,12 +1812,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 @@ -2810,10 +2792,7 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop ; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: v_max_f32_e32 v1, v2, v2 @@ -2883,11 +2862,8 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 @@ -3015,12 +2991,9 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | 
instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index a8b83edfa7438..b2c749c131f60 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -467,10 +467,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -536,10 +533,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -660,17 +654,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -1706,10 +1697,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: 
v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -1775,10 +1763,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -1899,17 +1884,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -2945,10 +2927,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -3014,10 +2993,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -3138,17 +3114,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) 
| instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -3726,10 +3699,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -3795,10 +3765,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -3919,17 +3886,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_ ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd @@ -4964,10 +4928,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: .LBB8_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s2, s1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s0 -; GFX9-NEXT: s_add_i32 s2, s2, 32 -; GFX9-NEXT: s_min_u32 s2, s3, s2 +; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX9-NEXT: v_readlane_b32 s4, v0, s2 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -5033,10 
+4994,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1064-NEXT: s_ff1_i32_b32 s2, s1 -; GFX1064-NEXT: s_ff1_i32_b32 s3, s0 -; GFX1064-NEXT: s_add_i32 s2, s2, 32 -; GFX1064-NEXT: s_min_u32 s2, s3, s2 +; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1] ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] @@ -5157,17 +5115,14 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_ctz_i32_b32 s2, s1 -; GFX1164-NEXT: s_ctz_i32_b32 s3, s0 -; GFX1164-NEXT: s_add_i32 s2, s2, 32 -; GFX1164-NEXT: s_min_u32 s2, s3, s2 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1] ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3] -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll index b90c92bf9be52..5296ef1f88678 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -182,10 +182,7 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: .LBB2_5: ; %ComputeLoop ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_ff1_i32_b32 s3, s5 -; VI-NEXT: s_ff1_i32_b32 s6, s4 -; VI-NEXT: s_add_i32 s3, s3, 32 -; VI-NEXT: s_min_u32 s3, s6, s3 +; VI-NEXT: s_ff1_i32_b64 s3, s[4:5] ; VI-NEXT: s_lshl_b64 s[6:7], 1, s3 ; VI-NEXT: v_readfirstlane_b32 s8, v1 ; VI-NEXT: v_readlane_b32 s9, v2, s3 @@ -268,10 +265,7 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) % ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: .LBB2_5: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s5 -; GFX9-NEXT: s_ff1_i32_b32 s6, s4 -; GFX9-NEXT: s_add_i32 s3, s3, 32 -; GFX9-NEXT: s_min_u32 s3, s6, s3 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[4:5] ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 ; GFX9-NEXT: v_readlane_b32 s9, v2, s3 @@ -543,10 +537,7 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; VI-NEXT: ; implicit-def: $vgpr0 ; VI-NEXT: .LBB3_5: ; %ComputeLoop ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_ff1_i32_b32 s3, s5 -; VI-NEXT: s_ff1_i32_b32 s6, s4 -; VI-NEXT: s_add_i32 s3, s3, 32 -; VI-NEXT: s_min_u32 s3, s6, s3 +; VI-NEXT: s_ff1_i32_b64 s3, s[4:5] ; VI-NEXT: s_lshl_b64 s[6:7], 1, s3 ; VI-NEXT: v_readfirstlane_b32 s8, v1 ; VI-NEXT: v_readlane_b32 s9, v2, s3 @@ 
-625,10 +616,7 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: .LBB3_5: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: s_ff1_i32_b32 s3, s5 -; GFX9-NEXT: s_ff1_i32_b32 s6, s4 -; GFX9-NEXT: s_add_i32 s3, s3, 32 -; GFX9-NEXT: s_min_u32 s3, s6, s3 +; GFX9-NEXT: s_ff1_i32_b64 s3, s[4:5] ; GFX9-NEXT: s_lshl_b64 s[6:7], 1, s3 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 ; GFX9-NEXT: v_readlane_b32 s9, v2, s3 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll new file mode 100644 index 0000000000000..91ba353390f3c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll @@ -0,0 +1,332 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s + +declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone +declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone + +define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; GFX9-LABEL: ctlz_i64_poison: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX9-NEXT: v_ffbh_u32_e32 v2, v2 +; GFX9-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp +; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: ctlz_i64_poison: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte 
v8, v1, s[2:3] offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_ffbh_u32_e32 v2, v3 +; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp +; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm + %val = load i64, ptr addrspace(1) %arrayidx, align 1 + %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone + store i64 %ctlz, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; GFX9-LABEL: ctlz_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX9-NEXT: v_ffbh_u32_e32 v2, v2 +; GFX9-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp +; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 +; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: ctlz_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 +; GFX10-NEXT: 
global_load_ubyte v8, v1, s[2:3] offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_or_b32_e32 v0, v0, v8 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 +; GFX10-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_ffbh_u32_e32 v2, v3 +; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp +; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 +; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm + %val = load i64, ptr addrspace(1) %arrayidx, align 1 + %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone + store i64 %ctlz, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; GFX9-LABEL: cttz_i64_poison: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX9-NEXT: v_ffbl_b32_e32 v0, v0 +; GFX9-NEXT: v_ffbl_b32_e32 v2, v2 +; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp +; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: cttz_i64_poison: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v7, 
v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 +; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp +; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm + %val = load i64, ptr addrspace(1) %arrayidx, align 1 + %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone + store i64 %cttz, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { +; GFX9-LABEL: cttz_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX9-NEXT: v_ffbl_b32_e32 v0, v0 +; GFX9-NEXT: v_ffbl_b32_e32 v2, v2 +; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp +; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 +; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: cttz_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 +; GFX10-NEXT: 
global_load_ubyte v6, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] +; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_or_b32_e32 v3, v4, v7 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_or_b32_sdwa v4, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 +; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 +; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp +; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 +; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm + %val = load i64, ptr addrspace(1) %arrayidx, align 1 + %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone + store i64 %cttz, ptr addrspace(1) %out, align 8 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index c48370a9c6c75..7f84d21fbbc44 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -156,15 +156,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_subb_u32 s7, s7, s2 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[12:13], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[6:7] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[8:9] -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 -; GCN-IR-NEXT: s_min_u32 s14, s8, s9 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s12 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13 -; GCN-IR-NEXT: s_min_u32 s20, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[12:13] ; GCN-IR-NEXT: s_sub_u32 s16, s14, s20 ; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63 @@ -993,15 +987,9 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_subb_u32 s7, s7, s4 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[6:7] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 -; GCN-IR-NEXT: s_min_u32 s14, s8, s9 -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s12 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s13 -; GCN-IR-NEXT: s_min_u32 s20, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[12:13] ; GCN-IR-NEXT: s_sub_u32 s16, s14, s20 ; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63 @@ -1203,10 +1191,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s2, 
s2, s4 ; GCN-IR-NEXT: s_subb_u32 s3, s3, s4 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s3 -; GCN-IR-NEXT: s_min_u32 s14, s10, s11 +; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[2:3] ; GCN-IR-NEXT: s_add_u32 s10, s14, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s11, 0, -1 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index ac212d22e9cfa..70e75116e180a 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -124,18 +124,14 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_min_u32 s18, s6, s7 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5] +; GCN-IR-NEXT: s_flbit_i32_b64 s18, s[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GCN-IR-NEXT: s_sub_u32 s12, s10, s18 ; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 @@ -145,9 +141,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 @@ -1029,15 +1023,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_subb_u32 s9, s7, s10 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[8:9] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s8 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s9 -; GCN-IR-NEXT: s_min_u32 s12, s6, s7 -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_min_u32 s20, s6, s7 +; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s14, s12, s20 ; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63 @@ -1180,15 +1168,9 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_subb_u32 s7, s7, s10 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[4:5], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[6:7] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s6 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s7 -; GCN-IR-NEXT: s_min_u32 s12, s8, s9 -; 
GCN-IR-NEXT: s_flbit_i32_b32 s8, s4 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s5 -; GCN-IR-NEXT: s_min_u32 s20, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[4:5] ; GCN-IR-NEXT: s_sub_u32 s14, s12, s20 ; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63 @@ -1393,10 +1375,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] ; GCN-IR-NEXT: s_sub_u32 s4, s2, s8 ; GCN-IR-NEXT: s_subb_u32 s5, s3, s8 -; GCN-IR-NEXT: s_flbit_i32_b32 s2, s4 -; GCN-IR-NEXT: s_add_i32 s2, s2, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s3, s5 -; GCN-IR-NEXT: s_min_u32 s12, s2, s3 +; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[4:5] ; GCN-IR-NEXT: s_add_u32 s2, s12, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s3, 0, -1 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 9301170c034d8..7a8d19200a72e 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -125,18 +125,14 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_min_u32 s16, s6, s7 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5] +; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 @@ -146,9 +142,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 @@ -796,15 +790,9 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[8:9], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[2:3] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[6:7] -; GCN-IR-NEXT: s_flbit_i32_b32 s4, s2 -; GCN-IR-NEXT: s_add_i32 s4, s4, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s5, s3 -; GCN-IR-NEXT: s_min_u32 s10, s4, s5 -; GCN-IR-NEXT: s_flbit_i32_b32 s4, s8 -; GCN-IR-NEXT: s_add_i32 s4, s4, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s5, s9 -; GCN-IR-NEXT: s_min_u32 s16, s4, s5 +; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[8:9] ; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 ; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 @@ -986,10 
+974,7 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_min_u32 s12, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] ; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 @@ -1406,10 +1391,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_min_u32 s12, s6, s7 +; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] ; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 ; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 784993ccd3bd1..3ec51b01c7a3c 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -124,18 +124,14 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s4 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s5 -; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 -; GCN-IR-NEXT: s_min_u32 s10, s10, s11 -; GCN-IR-NEXT: s_min_u32 s18, s6, s7 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5] +; GCN-IR-NEXT: s_flbit_i32_b64 s18, s[2:3] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GCN-IR-NEXT: s_sub_u32 s12, s10, s18 ; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 @@ -145,9 +141,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] -; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 @@ -814,10 +808,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_flbit_i32_b32 s8, s2 -; GCN-IR-NEXT: s_flbit_i32_b32 s9, s3 -; GCN-IR-NEXT: s_add_i32 s8, s8, 32 -; GCN-IR-NEXT: s_min_u32 s12, s8, s9 +; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] ; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 @@ -973,10 +964,7 @@ define amdgpu_kernel void 
@s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR: ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2
-; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3
-; GCN-IR-NEXT: s_add_i32 s6, s6, 32
-; GCN-IR-NEXT: s_min_u32 s12, s6, s7
+; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3]
 ; GCN-IR-NEXT: s_sub_u32 s8, 59, s12
 ; GCN-IR-NEXT: s_subb_u32 s9, 0, 0
 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0

From 952d344f3e25a352bc5f2f1f5f611e96bd7acb91 Mon Sep 17 00:00:00 2001
From: Tyler Rockwood <rockwotj@gmail.com>
Date: Mon, 25 Dec 2023 05:19:53 -0600
Subject: [PATCH 255/342] [clang-tidy] introduce an unused local non-trivial variable check (#76101)

Introduce a new (off by default) clang-tidy check to ensure that variables of
a specific type are always used even if -Wunused-variables wouldn't generate a
warning.

This check has already caught a couple of different bugs on the codebase I
work on, where not handling a future means that lifetimes may not be kept
alive properly, as an async chunk of code may run after a class has been
destroyed, etc. I would like to upstream it because I believe there could be
other applications of this check that would be useful in different contexts.

---------

Signed-off-by: Tyler Rockwood <rockwood@redpanda.com>
---
 .../bugprone/BugproneTidyModule.cpp | 3 +
 .../clang-tidy/bugprone/CMakeLists.txt | 1 +
 .../UnusedLocalNonTrivialVariableCheck.cpp | 91 ++++++++++++++
 .../UnusedLocalNonTrivialVariableCheck.h | 44 +++++++
 clang-tools-extra/docs/ReleaseNotes.rst | 5 +
 .../unused-local-non-trivial-variable.rst | 56 +++++++++
 .../docs/clang-tidy/checks/list.rst | 1 +
 .../unused-local-non-trivial-variable.cpp | 118 ++++++++++++++++++
 8 files changed, 319 insertions(+)
 create mode 100644 clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp
 create mode 100644 clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h
 create mode 100644 clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst
 create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp

diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index 7a910037368c8..435cb1e3fbcff 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -83,6 +83,7 @@
 #include "UnhandledSelfAssignmentCheck.h"
 #include "UniquePtrArrayMismatchCheck.h"
 #include "UnsafeFunctionsCheck.h"
+#include "UnusedLocalNonTrivialVariableCheck.h"
 #include "UnusedRaiiCheck.h"
 #include "UnusedReturnValueCheck.h"
 #include "UseAfterMoveCheck.h"
@@ -235,6 +236,8 @@ class BugproneModule : public ClangTidyModule {
         "bugprone-unique-ptr-array-mismatch");
     CheckFactories.registerCheck<UnsafeFunctionsCheck>(
         "bugprone-unsafe-functions");
+    CheckFactories.registerCheck<UnusedLocalNonTrivialVariableCheck>(
+        "bugprone-unused-local-non-trivial-variable");
     CheckFactories.registerCheck<UnusedRaiiCheck>("bugprone-unused-raii");
     CheckFactories.registerCheck<UnusedReturnValueCheck>(
         "bugprone-unused-return-value");
diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
index d443fd8d1452f..70e7fbc7ec0c1 100644
--- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt
@@ -79,6 +79,7 @@ add_clang_library(clangTidyBugproneModule UnhandledSelfAssignmentCheck.cpp UniquePtrArrayMismatchCheck.cpp UnsafeFunctionsCheck.cpp + UnusedLocalNonTrivialVariableCheck.cpp UnusedRaiiCheck.cpp UnusedReturnValueCheck.cpp UseAfterMoveCheck.cpp diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp new file mode 100644 index 0000000000000..ee7f365320ff9 --- /dev/null +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp @@ -0,0 +1,91 @@ +//===--- UnusedLocalNonTrivialVariableCheck.cpp - clang-tidy --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "UnusedLocalNonTrivialVariableCheck.h" +#include "../utils/Matchers.h" +#include "../utils/OptionsUtils.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/ASTTypeTraits.h" +#include "clang/AST/Type.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" +#include "clang/ASTMatchers/ASTMatchersMacros.h" + +using namespace clang::ast_matchers; +using namespace clang::tidy::matchers; + +namespace clang::tidy::bugprone { + +namespace { +static constexpr StringRef DefaultIncludeTypeRegex = + "::std::.*mutex;::std::future;::std::basic_string;::std::basic_regex;" + "::std::base_istringstream;::std::base_stringstream;::std::bitset;" + "::std::filesystem::path"; + +AST_MATCHER(VarDecl, isLocalVarDecl) { return Node.isLocalVarDecl(); } +AST_MATCHER(VarDecl, isReferenced) { return Node.isReferenced(); } +AST_MATCHER(Type, isReferenceType) { return Node.isReferenceType(); } +AST_MATCHER(QualType, isTrivial) { + return Node.isTrivialType(Finder->getASTContext()) || + Node.isTriviallyCopyableType(Finder->getASTContext()); +} +} // namespace + +UnusedLocalNonTrivialVariableCheck::UnusedLocalNonTrivialVariableCheck( + StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + IncludeTypes(utils::options::parseStringList( + Options.get("IncludeTypes", DefaultIncludeTypeRegex))), + ExcludeTypes( + utils::options::parseStringList(Options.get("ExcludeTypes", ""))) {} + +void UnusedLocalNonTrivialVariableCheck::storeOptions( + ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "IncludeTypes", + utils::options::serializeStringList(IncludeTypes)); + Options.store(Opts, "ExcludeTypes", + utils::options::serializeStringList(ExcludeTypes)); +} + +void UnusedLocalNonTrivialVariableCheck::registerMatchers(MatchFinder *Finder) { + if (IncludeTypes.empty()) + return; + + Finder->addMatcher( + varDecl(isLocalVarDecl(), unless(isReferenced()), + unless(isExceptionVariable()), hasLocalStorage(), isDefinition(), + unless(hasType(isReferenceType())), unless(hasType(isTrivial())), + hasType(hasUnqualifiedDesugaredType( + anyOf(recordType(hasDeclaration(namedDecl( + matchesAnyListedName(IncludeTypes), + unless(matchesAnyListedName(ExcludeTypes))))), + templateSpecializationType(hasDeclaration(namedDecl( + matchesAnyListedName(IncludeTypes), + unless(matchesAnyListedName(ExcludeTypes))))))))) + .bind("var"), + this); +} + +void UnusedLocalNonTrivialVariableCheck::check( + const MatchFinder::MatchResult &Result) { + const auto *MatchedDecl = Result.Nodes.getNodeAs<VarDecl>("var"); + 
diag(MatchedDecl->getLocation(), "unused local variable %0 of type %1") + << MatchedDecl << MatchedDecl->getType(); +} + +bool UnusedLocalNonTrivialVariableCheck::isLanguageVersionSupported( + const LangOptions &LangOpts) const { + return LangOpts.CPlusPlus; +} + +std::optional<TraversalKind> +UnusedLocalNonTrivialVariableCheck::getCheckTraversalKind() const { + return TK_IgnoreUnlessSpelledInSource; +} + +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h new file mode 100644 index 0000000000000..e79b803a2158b --- /dev/null +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h @@ -0,0 +1,44 @@ +//===--- UnusedLocalNonTrivialVariableCheck.h - clang-tidy ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNUSEDLOCALNONTRIVIALVARIABLECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNUSEDLOCALNONTRIVIALVARIABLECHECK_H + +#include "../ClangTidyCheck.h" + +namespace clang::tidy::bugprone { + +/// Warns when a local non trivial variable is unused within a function. By +/// default std::.*mutex and std::future are included. +/// +/// The check supports these options: +/// - 'IncludeTypes': a semicolon-separated list of regular expressions +/// matching types to ensure must be used. +/// - 'ExcludeTypes': a semicolon-separated list of regular expressions +/// matching types that are excluded from the +/// 'IncludeTypes' matches. +/// +/// For the user-facing documentation see: +/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.html +class UnusedLocalNonTrivialVariableCheck : public ClangTidyCheck { +public: + UnusedLocalNonTrivialVariableCheck(StringRef Name, ClangTidyContext *Context); + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; + bool isLanguageVersionSupported(const LangOptions &LangOpts) const override; + std::optional<TraversalKind> getCheckTraversalKind() const override; + +private: + const std::vector<StringRef> IncludeTypes; + const std::vector<StringRef> ExcludeTypes; +}; + +} // namespace clang::tidy::bugprone + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_UNUSEDLOCALNONTRIVIALVARIABLECHECK_H diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 6d91748e4cef1..6e7554e0433c2 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -168,6 +168,11 @@ New checks extracted from an optional-like type and then used to create a new instance of the same optional-like type. +- New :doc:`bugprone-unused-local-non-trivial-variable + <clang-tidy/checks/bugprone/unused-local-non-trivial-variable>` check. + + Warns when a local non trivial variable is unused within a function. + - New :doc:`cppcoreguidelines-no-suspend-with-lock <clang-tidy/checks/cppcoreguidelines/no-suspend-with-lock>` check. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst new file mode 100644 index 0000000000000..7a72a08d8f3aa --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst @@ -0,0 +1,56 @@ +.. title:: clang-tidy - bugprone-unused-local-non-trivial-variable + +bugprone-unused-local-non-trivial-variable +========================================== + +Warns when a local non trivial variable is unused within a function. +The following types of variables are excluded from this check: + +* trivial and trivially copyable +* references and pointers +* exception variables in catch clauses +* static or thread local +* structured bindings + +This check can be configured to warn on all non-trivial variables by setting +`IncludeTypes` to `.*`, and excluding specific types using `ExcludeTypes`. + +In the this example, `my_lock` would generate a warning that it is unused. + +.. code-block:: c++ + + std::mutex my_lock; + // my_lock local variable is never used + +In the next example, `future2` would generate a warning that it is unused. + +.. code-block:: c++ + + std::future<MyObject> future1; + std::future<MyObject> future2; + // ... + MyObject foo = future1.get(); + // future2 is not used. + +Options +------- + +.. option:: IncludeTypes + + Semicolon-separated list of regular expressions matching types of variables + to check. + By default the following types are checked: + + * `::std::.*mutex` + * `::std::future` + * `::std::string` + * `::std::basic_regex` + * `::std::basic_istringstream` + * `::std::basic_stringstream` + * `::std::bitset` + * `::std::path` + +.. option:: ExcludeTypes + + A semicolon-separated list of regular expressions matching types that are + excluded from the `IncludeTypes` matches. By default it is an empty list. 
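To make the documented behaviour concrete, here is a minimal, self-contained sketch of the kind of translation unit the new check warns about. It is illustrative only and not part of the patch: the file name, the enabling command shown in the comment, and the variable names are assumptions, and the exact diagnostics depend on the configured `IncludeTypes` (the defaults listed above are assumed here).

    // example.cpp -- sketch; the check is off by default, so it has to be
    // enabled explicitly, e.g.:
    //   clang-tidy -checks=-*,bugprone-unused-local-non-trivial-variable example.cpp -- -std=c++17
    #include <future>
    #include <mutex>

    int compute() {
      // Expected to be flagged: ::std::future matches the default IncludeTypes
      // and the variable is never read, waited on, or otherwise referenced.
      std::future<int> pending = std::async(std::launch::async, [] { return 42; });

      // Expected to be flagged: ::std::.*mutex matches the default IncludeTypes.
      std::mutex unused_guard;

      // Not flagged: trivial type, and it is referenced below.
      int result = 7;
      return result;
    }

Plain -Wunused-variable would typically stay silent on both flagged variables because their destructors have side effects, which is exactly the gap this check is meant to close.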
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 31f0e090db1d7..39d8b490d927c 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -149,6 +149,7 @@ Clang-Tidy Checks :doc:`bugprone-unhandled-self-assignment <bugprone/unhandled-self-assignment>`, :doc:`bugprone-unique-ptr-array-mismatch <bugprone/unique-ptr-array-mismatch>`, "Yes" :doc:`bugprone-unsafe-functions <bugprone/unsafe-functions>`, + :doc:`bugprone-unused-local-non-trivial-variable <bugprone/unused-local-non-trivial-variable>`, :doc:`bugprone-unused-raii <bugprone/unused-raii>`, "Yes" :doc:`bugprone-unused-return-value <bugprone/unused-return-value>`, :doc:`bugprone-use-after-move <bugprone/use-after-move>`, diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp new file mode 100644 index 0000000000000..9bbf3d116885f --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp @@ -0,0 +1,118 @@ +// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-unused-local-non-trivial-variable %t -- \ +// RUN: -config="{CheckOptions: {bugprone-unused-local-non-trivial-variable.IncludeTypes: '::async::Future;::async::Foo.*', bugprone-unused-local-non-trivial-variable.ExcludeTypes: '::async::FooBar'}}" + + +namespace async { +template <typename T> +class Ptr { + public: + explicit Ptr(T Arg) : Underlying(new T(Arg)) {} + T& operator->() { + return Underlying; + } + ~Ptr() { + delete Underlying; + } + private: + T* Underlying; +}; + +template<typename T> +class Future { +public: + T get() { + return Pending; + } + ~Future(); +private: + T Pending; +}; + +class FooBar { + public: + ~FooBar(); + private: + Future<int> Fut; +}; + +class FooQux { + public: + ~FooQux(); + private: + Future<int> Fut; +}; + +class FizzFoo { + public: + ~FizzFoo(); + private: + Future<int> Fut; +}; + +} // namespace async + +// Warning is still emitted if there are type aliases. 
+namespace a { +template<typename T> +using Future = async::Future<T>; +} // namespace a + +void releaseUnits(); +struct Units { + ~Units() { + releaseUnits(); + } +}; +a::Future<Units> acquireUnits(); + +template<typename T> +T qux(T Generic) { + async::Future<Units> PendingA = acquireUnits(); + auto PendingB = acquireUnits(); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: unused local variable 'PendingB' of type 'a::Future<Units>' (aka 'Future<Units>') [bugprone-unused-local-non-trivial-variable] + async::Future<Units> MustBeUsed; + // CHECK-MESSAGES: :[[@LINE-1]]:26: warning: unused local variable 'MustBeUsed' of type 'async::Future<Units>' [bugprone-unused-local-non-trivial-variable] + PendingA.get(); + async::Future<T> TemplateType; + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: unused local variable 'TemplateType' of type 'async::Future<T>' [bugprone-unused-local-non-trivial-variable] + a::Future<T> AliasTemplateType; + // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: unused local variable 'AliasTemplateType' of type 'a::Future<T>' (aka 'Future<type-parameter-0-0>') [bugprone-unused-local-non-trivial-variable] + return Generic; +} + +async::Future<int> Global; + +int bar(int Num) { + a::Future<Units> PendingA = acquireUnits(); + a::Future<Units> PendingB = acquireUnits(); // not used at all, unused variable not fired because of destructor side effect + // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: unused local variable 'PendingB' of type 'a::Future<Units>' (aka 'Future<Units>') [bugprone-unused-local-non-trivial-variable] + auto Num2 = PendingA.get(); + auto Num3 = qux(Num); + async::Ptr<a::Future<Units>> Shared = async::Ptr<a::Future<Units>>(acquireUnits()); + static auto UnusedStatic = async::Future<Units>(); + thread_local async::Future<Units> UnusedThreadLocal; + auto Captured = acquireUnits(); + Num3 += [Captured]() { + return 1; + }(); + a::Future<Units> Referenced = acquireUnits(); + a::Future<Units>* Pointer = &Referenced; + a::Future<Units>& Reference = Referenced; + const a::Future<Units>& ConstReference = Referenced; + try { + } catch (a::Future<Units> Fut) { + } + struct Holder { + a::Future<Units> Fut; + }; + Holder H; + auto [fut] = H; + return Num * Num3; +} + +void exclusion() { + async::FizzFoo A; + async::FooBar B; + async::FooQux C; + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: unused local variable 'C' of type 'async::FooQux' [bugprone-unused-local-non-trivial-variable] +} From 3f85fb9a021b70cb947d32efd9bdbddb6fb3a1d7 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com> Date: Mon, 25 Dec 2023 11:20:28 +0000 Subject: [PATCH 256/342] [gn build] Port 952d344f3e25 --- .../gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index dcc2520006984..7f9302e06f8b7 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -91,6 +91,7 @@ static_library("bugprone") { "UnhandledSelfAssignmentCheck.cpp", "UniquePtrArrayMismatchCheck.cpp", "UnsafeFunctionsCheck.cpp", + "UnusedLocalNonTrivialVariableCheck.cpp", "UnusedRaiiCheck.cpp", "UnusedReturnValueCheck.cpp", "UseAfterMoveCheck.cpp", From 007ed0dccd6a3d19f331eb7cd91438d792754439 Mon Sep 17 00:00:00 2001 From: Piotr Zegar <me@piotrzegar.pl> Date: Mon, 25 Dec 2023 11:29:24 +0000 Subject: [PATCH 257/342] 
[clang-tidy][NFC] Enable exceptions in test for bugprone-unused-local-non-trivial-variable Added -fexceptions switch to test. It were missing in #76101. --- .../checkers/bugprone/unused-local-non-trivial-variable.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp index 9bbf3d116885f..4900020371641 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp @@ -1,6 +1,6 @@ // RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-unused-local-non-trivial-variable %t -- \ // RUN: -config="{CheckOptions: {bugprone-unused-local-non-trivial-variable.IncludeTypes: '::async::Future;::async::Foo.*', bugprone-unused-local-non-trivial-variable.ExcludeTypes: '::async::FooBar'}}" - +// RUN: -- -fexceptions namespace async { template <typename T> @@ -19,7 +19,7 @@ class Ptr { template<typename T> class Future { -public: +public: T get() { return Pending; } From 37fc9c6a4227b1736cc643eb95636d9f7ec30190 Mon Sep 17 00:00:00 2001 From: Piotr Zegar <me@piotrzegar.pl> Date: Mon, 25 Dec 2023 11:38:06 +0000 Subject: [PATCH 258/342] [clang-tidy][NFC] Enable exceptions in test for bugprone-unused-local-non-trivial-variable Added -fexceptions switch to test. Added missing Fixes for #76101. --- .../checkers/bugprone/unused-local-non-trivial-variable.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp index 4900020371641..19f2344de4a65 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-local-non-trivial-variable.cpp @@ -1,5 +1,5 @@ // RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-unused-local-non-trivial-variable %t -- \ -// RUN: -config="{CheckOptions: {bugprone-unused-local-non-trivial-variable.IncludeTypes: '::async::Future;::async::Foo.*', bugprone-unused-local-non-trivial-variable.ExcludeTypes: '::async::FooBar'}}" +// RUN: -config="{CheckOptions: {bugprone-unused-local-non-trivial-variable.IncludeTypes: '::async::Future;::async::Foo.*', bugprone-unused-local-non-trivial-variable.ExcludeTypes: '::async::FooBar'}}" \ // RUN: -- -fexceptions namespace async { From 9fba1d5f3a52af0ae62f386d0c494bd9510fa845 Mon Sep 17 00:00:00 2001 From: Piotr Zegar <me@piotrzegar.pl> Date: Mon, 25 Dec 2023 13:09:12 +0000 Subject: [PATCH 259/342] [clang-tidy] Fixes for bugprone-unused-local-non-trivial-variable Fixed spelling of some classes in code and in documentation. 
Fixes for #76101 --- .../bugprone/UnusedLocalNonTrivialVariableCheck.cpp | 2 +- .../bugprone/unused-local-non-trivial-variable.rst | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp index ee7f365320ff9..1b763d291082b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.cpp @@ -24,7 +24,7 @@ namespace clang::tidy::bugprone { namespace { static constexpr StringRef DefaultIncludeTypeRegex = "::std::.*mutex;::std::future;::std::basic_string;::std::basic_regex;" - "::std::base_istringstream;::std::base_stringstream;::std::bitset;" + "::std::basic_istringstream;::std::basic_stringstream;::std::bitset;" "::std::filesystem::path"; AST_MATCHER(VarDecl, isLocalVarDecl) { return Node.isLocalVarDecl(); } diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst index 7a72a08d8f3aa..7531f19f3ebc1 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.rst @@ -38,19 +38,18 @@ Options .. option:: IncludeTypes Semicolon-separated list of regular expressions matching types of variables - to check. - By default the following types are checked: + to check. By default the following types are checked: * `::std::.*mutex` * `::std::future` - * `::std::string` + * `::std::basic_string` * `::std::basic_regex` * `::std::basic_istringstream` * `::std::basic_stringstream` * `::std::bitset` - * `::std::path` + * `::std::filesystem::path` .. option:: ExcludeTypes - A semicolon-separated list of regular expressions matching types that are + A semicolon-separated list of regular expressions matching types that are excluded from the `IncludeTypes` matches. By default it is an empty list. From 6452395561eaae59e38f1df84f5413dffdb9169f Mon Sep 17 00:00:00 2001 From: Weining Lu <luweining@loongson.cn> Date: Mon, 25 Dec 2023 22:41:09 +0800 Subject: [PATCH 260/342] Revert "[lld][test][LoongArch] Remove the test for R_LARCH_CALL36 range checking" This reverts commit 0fbc728dba97149e530cfb7f2ada0283c398a7ce. In 88548df0fc08, both the .sec.foo and .tex sections used the same section flags, hence sharing one segment, pushing the output file size too large. This breaks on many buildbots. Now assign section .sec.foo different flags ("awx") from .text ("ax") so that both sections get their own segment. 
--- lld/test/ELF/loongarch-call36.s | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lld/test/ELF/loongarch-call36.s b/lld/test/ELF/loongarch-call36.s index 0a00adacbd6a5..b593fdf1f6045 100644 --- a/lld/test/ELF/loongarch-call36.s +++ b/lld/test/ELF/loongarch-call36.s @@ -40,6 +40,10 @@ # GOTPLT-NEXT: 0x01274730 00000000 00000000 00000000 00000000 # GOTPLT-NEXT: 0x01274740 00452301 00000000 +# RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x2000020000 -o /dev/null 2>&1 | \ +# RUN: FileCheck -DFILE=%t/a.o --check-prefix=ERROR-RANGE %s +# ERROR-RANGE: error: [[FILE]]:(.text+0x0): relocation R_LARCH_CALL36 out of range: 137438953472 is not in [-137439084544, 137438822399]; references 'foo' + ## Impossible case in reality becasue all LoongArch instructions are fixed 4-bytes long. # RUN: not ld.lld %t/a.o --section-start=.text=0x20000 --section-start=.sec.foo=0x40001 -o /dev/null 2>&1 | \ # RUN: FileCheck -DFILE=%t/a.o --check-prefix=ERROR-ALIGN %s @@ -59,7 +63,7 @@ _start: pcaddu18i $t0, 0 jirl $zero, $t0, 0 -.section .sec.foo,"ax" +.section .sec.foo,"awx" .global foo foo: ret From ff76627aeb8d431d5451201d656bb38318908f0a Mon Sep 17 00:00:00 2001 From: Yingwei Zheng <dtcxzyw2333@gmail.com> Date: Tue, 26 Dec 2023 00:04:06 +0800 Subject: [PATCH 261/342] [InstCombine] Fix type mismatch between cond and value in `foldSelectToCopysign` (#76343) This patch fixes the miscompilation when we try to bitcast a floating point vector into an integer scalar. --- .../Transforms/InstCombine/InstCombineSelect.cpp | 3 +++ llvm/test/Transforms/InstCombine/select.ll | 15 +++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 20bf00344b144..3c6ce450c5bcf 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -2363,6 +2363,9 @@ static Instruction *foldSelectToCopysign(SelectInst &Sel, Value *FVal = Sel.getFalseValue(); Type *SelType = Sel.getType(); + if (ICmpInst::makeCmpResultType(TVal->getType()) != Cond->getType()) + return nullptr; + // Match select ?, TC, FC where the constants are equal but negated. // TODO: Generalize to handle a negated variable operand? 
const APFloat *TC, *FC; diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 7583a75385a76..94aa012f86801 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -1735,6 +1735,21 @@ define float @copysign_type_mismatch(double %x) { ; Negative test +define <2 x float> @copysign_type_mismatch2(<2 x float> %x) { +; CHECK-LABEL: @copysign_type_mismatch2( +; CHECK-NEXT: [[I:%.*]] = bitcast <2 x float> [[X:%.*]] to i64 +; CHECK-NEXT: [[ISPOS:%.*]] = icmp sgt i64 [[I]], -1 +; CHECK-NEXT: [[R:%.*]] = select i1 [[ISPOS]], <2 x float> <float 1.000000e+00, float 1.000000e+00>, <2 x float> <float -1.000000e+00, float -1.000000e+00> +; CHECK-NEXT: ret <2 x float> [[R]] +; + %i = bitcast <2 x float> %x to i64 + %ispos = icmp sgt i64 %i, -1 + %r = select i1 %ispos, <2 x float> <float 1.0, float 1.0>, <2 x float> <float -1.0, float -1.0> + ret <2 x float> %r +} + +; Negative test + define float @copysign_wrong_cmp(float %x) { ; CHECK-LABEL: @copysign_wrong_cmp( ; CHECK-NEXT: [[I:%.*]] = bitcast float [[X:%.*]] to i32 From 5cfc7b3342ce4de0bbe182b38baa8a71fc83f8f8 Mon Sep 17 00:00:00 2001 From: Kai Luo <lkail@cn.ibm.com> Date: Tue, 26 Dec 2023 00:21:56 +0800 Subject: [PATCH 262/342] [PowerPC] Add test after #75271 on PPC. NFC. (#75616) Demonstrate `IMPLICIT_DEF implicit-def ...` can be generated after coalescing on PPC. The case is reduced from failure in #75570. The failure is triggered after #75271 . --- ...-remat-with-undef-implicit-def-operand.mir | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir diff --git a/llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir b/llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir new file mode 100644 index 0000000000000..8e4e3be55600f --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/coalescer-remat-with-undef-implicit-def-operand.mir @@ -0,0 +1,28 @@ +# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -verify-coalescing -run-pass=register-coalescer \ +# RUN: -o - %s | FileCheck %s +--- +name: _Z13testTransposeIfLj31ELj17EEvv +alignment: 16 +tracksRegLiveness: true +frameInfo: + maxAlignment: 128 +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: _Z13testTransposeIfLj31ELj17EEvv + ; CHECK: undef %[[REG:[0-9]+]].sub_64:vsrc = IMPLICIT_DEF implicit-def %[[REG]] + bb.0: + liveins: $x2 + %2:vssrc = IMPLICIT_DEF + B %bb.2 + + bb.1: + %0:vsrc = SUBREG_TO_REG 1, killed %2, %subreg.sub_64 + %1:vsrc = XXPERMDI killed undef %0, killed %0, 0 + BLR8 implicit $lr8, implicit $rm + + bb.2: + successors: %bb.2(0x7c000000), %bb.1(0x04000000) + BDNZ8 %bb.2, implicit-def $ctr8, implicit $ctr8 + B %bb.1 + +... From 95bdbc8a28c5f5e91ea25bf246788b49aa6ec160 Mon Sep 17 00:00:00 2001 From: Lang Hames <lhames@gmail.com> Date: Mon, 25 Dec 2023 09:54:14 -0800 Subject: [PATCH 263/342] [JITLink] Add -phony-externals to silence test errors on some buildbots. The recently added MachO_subtractor_single_block tests reference C++ itanium ABI symbols, but these break on some Windows systems (see e.g. https://lab.llvm.org/buildbot/#/builders/117/builds/17021). Adding -phony-externals should suppress the errors. 
---
 .../JITLink/AArch64/MachO_subtractor_single_block.yaml | 2 +-
 .../JITLink/x86-64/MachO_subtractor_single_block.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml
index dec9f274072cb..bf72750eece05 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml
+++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec %t
+# RUN: llvm-jitlink -noexec -phony-externals %t
 #
 # Check that MachO::ARM64_RELOC_SUBTRACTOR relocations work when the fixup
 # location and target are in the same block (in this case in the __eh_frame
diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml
index 704c611cf9f79..12542cf7c3142 100644
--- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml
+++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml
@@ -1,5 +1,5 @@
 # RUN: yaml2obj %s -o %t
-# RUN: llvm-jitlink -noexec %t
+# RUN: llvm-jitlink -noexec -phony-externals %t
 #
 # Check that MachO::X86_64_RELOC_SUBTRACTOR relocations work when the fixup
 # location and target are in the same block (in this case in the __eh_frame

From 9d6837d595719904720e5ff68ec1f1a2665bdc2f Mon Sep 17 00:00:00 2001
From: Michael Klemm <michael.klemm@amd.com>
Date: Mon, 25 Dec 2023 19:15:00 +0100
Subject: [PATCH 264/342] [flang][driver] Remove Fortran_main static library from linking stages (#75816)

At present, when building static or shared libraries, Flang adds
`-lFortran_main.a` (or `/WHOLEARCHIVE:Fortran.*.lib` on Windows) to the link
line. This leads to the problem that `_QQmain` and `_QQEnvironmentDefaults`
(as of the time of this PR) are symbols marked as used, while `main` is being
defined. This should not happen, and this PR fixes it by detecting if
`-shared` or `-static` is used on the Flang command line and removing the
static `Fortran_main` library.

---------

Co-authored-by: kkwli <kkwli@users.noreply.github.com>
---
 clang/lib/Driver/ToolChains/CommonArgs.cpp | 20 ++++++++
 flang/docs/FlangDriver.md | 57 ++++++++++++++++++++++
 flang/test/Driver/dynamic-linker.f90 | 8 ++-
 3 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 6eb0ed8f3fed9..3b29e1bc75850 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1133,6 +1133,16 @@ static bool isWholeArchivePresent(const ArgList &Args) {
   return WholeArchiveActive;
 }
+/// Determine if driver is invoked to create a shared object library (-shared)
+static bool isSharedLinkage(const ArgList &Args) {
+  return Args.hasArg(options::OPT_shared);
+}
+
+/// Determine if driver is invoked to create a static object library (-static)
+static bool isStaticLinkage(const ArgList &Args) {
+  return Args.hasArg(options::OPT_static);
+}
+
 /// Add Fortran runtime libs for MSVC
 static void addFortranRuntimeLibsMSVC(const ArgList &Args,
                                       llvm::opt::ArgStringList &CmdArgs) {
@@ -1164,6 +1174,16 @@ static void addFortranRuntimeLibsMSVC(const ArgList &Args,
 // Add FortranMain runtime lib
 static void addFortranMain(const ToolChain &TC, const ArgList &Args,
                            llvm::opt::ArgStringList &CmdArgs) {
+  // 0. Shared-library linkage
+  // If we are attempting to link a library, we should not add
+  // -lFortran_main.a to the link line, as the `main` symbol is not
+  // required for a library and should also be provided by one of
+  // the translation units of the code that this shared library
+  // will be linked against eventually.
+  if (isSharedLinkage(Args) || isStaticLinkage(Args)) {
+    return;
+  }
+
   // 1. MSVC
   if (TC.getTriple().isKnownWindowsMSVCEnvironment()) {
     addFortranRuntimeLibsMSVC(Args, CmdArgs);
diff --git a/flang/docs/FlangDriver.md b/flang/docs/FlangDriver.md
index 5231e78335f6a..fa39889927e0e 100644
--- a/flang/docs/FlangDriver.md
+++ b/flang/docs/FlangDriver.md
@@ -163,6 +163,63 @@ forward compiler options to the frontend driver, `flang-new -fc1`.
 You can read more on the design of `clangDriver` in Clang's
 [Driver Design & Internals](https://clang.llvm.org/docs/DriverInternals.html).
+## Linker Driver
+When used as a linker, Flang's frontend driver assembles the command line for an
+external linker command (e.g., LLVM's `lld`) and invokes it to create the final
+executable by linking static and shared libraries together with all the
+translation units supplied as object files.
+
+By default, the Flang linker driver adds several libraries to the linker
+invocation to make sure that all entrypoints for program start
+(Fortran's program unit) and runtime routines can be resolved by the linker.
+
+An abridged example (only showing the Fortran-specific linker flags, omission
+indicated by `[...]`) for such a linker invocation on a Linux system would look
+like this:
+
+```
+$ flang -v -o example example.o
+"/usr/bin/ld" [...] example.o [...] "--whole-archive" "-lFortran_main"
+"--no-whole-archive" "-lFortranRuntime" "-lFortranDecimal" [...]
+```
+
+The automatically added libraries are:
+
+* `Fortran_main`: Provides the main entry point `main` that then invokes
+  `_QQmain` with the Fortran program unit. This library has a dependency on
+  the `FortranRuntime` library.
+* `FortranRuntime`: Provides most of the Flang runtime library.
+* `FortranDecimal`: Provides operations for decimal numbers.
+
+The default is that, when using Flang as the linker, one of the Fortran
+translation units provides the program unit and therefore it is assumed that
+Fortran is the main code part (calling into C/C++ routines via `BIND (C)`
+interfaces). When composing the linker command line, Flang uses
+`--whole-archive` and `--no-whole-archive` (Windows: `/WHOLEARCHIVE:`,
+Darwin & AIX: *not implemented yet*) to make sure that all of `Fortran_main`
+is processed by the linker. This is done to issue a proper error message when
+multiple definitions of `main` occur. This happens, for instance, when linking
+a code that has a Fortran program unit with a C/C++ code that also defines a
+`main` function. A user may be required to explicitly provide the C++ runtime
+libraries at link time (e.g., via `-lstdc++` for STL).
+
+If the code is C/C++ based and invokes Fortran routines, one can either use Clang
+or Flang as the linker driver. If Clang is used, it will automatically add all
+required runtime libraries needed by C++ (e.g., for STL) to the linker invocation.
+In this case, one has to explicitly provide the Fortran runtime libraries
+`FortranRuntime` and/or `FortranDecimal`. An alternative is to use Flang to link
+and use the `-fno-fortran-main` flag. This flag removes
+`Fortran_main` from the linker stage and hence requires one of the C/C++
+translation units to provide a definition of the `main` function.
In this case, +it may be required to explicitly supply C++ runtime libraries as mentioned above. + +When creating shared or static libraries using Flang with `-shared` or `-static` +flag, Fortran_main is automatically removed from the linker stage (i.e., +`-fno-fortran-main` is on by default). It is assumed that when creating a +static or shared library, the generated library does not need a `main` +function, as a final link stage will occur that will provide the `Fortran_main` +library when creating the final executable. + ## Frontend Driver Flang's frontend driver is the main interface between compiler developers and the Flang frontend. The high-level design is similar to Clang's frontend diff --git a/flang/test/Driver/dynamic-linker.f90 b/flang/test/Driver/dynamic-linker.f90 index 1cbd407d21ce0..7c3f1b5a53fe4 100644 --- a/flang/test/Driver/dynamic-linker.f90 +++ b/flang/test/Driver/dynamic-linker.f90 @@ -3,10 +3,12 @@ ! RUN: %flang -### --target=x86_64-linux-gnu -rpath /path/to/dir -shared \ ! RUN: -static %s 2>&1 | FileCheck \ -! RUN: --check-prefixes=GNU-LINKER-OPTIONS %s +! RUN: --check-prefixes=GNU-LINKER-OPTIONS \ +! RUN: --implicit-check-not=GNU-LINKER-OPTIONS-NOT %s ! RUN: %flang -### --target=x86_64-windows-msvc -rpath /path/to/dir -shared \ ! RUN: -static %s 2>&1 | FileCheck \ -! RUN: --check-prefixes=MSVC-LINKER-OPTIONS %s +! RUN: --check-prefixes=MSVC-LINKER-OPTIONS \ +! RUN: --implicit-check-not=MSVC-LINKER-OPTIONS-NOT %s ! RUN: %flang -### --target=aarch64-linux-none -rdynamic %s 2>&1 | FileCheck --check-prefixes=RDYNAMIC-LINKER-OPTION %s ! TODO: Could the linker have an extension or a suffix? @@ -14,6 +16,7 @@ ! GNU-LINKER-OPTIONS-SAME: "-shared" ! GNU-LINKER-OPTIONS-SAME: "-static" ! GNU-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir" +! GNU-LINKER-OPTIONS-NOT: "-lFortran_main.a" ! RDYNAMIC-LINKER-OPTION: "{{.*}}ld" ! RDYNAMIC-LINKER-OPTION-SAME: "-export-dynamic" @@ -22,3 +25,4 @@ ! MSVC-LINKER-OPTIONS: "{{.*}}link{{(.exe)?}}" ! MSVC-LINKER-OPTIONS-SAME: "-dll" ! MSVC-LINKER-OPTIONS-SAME: "-rpath" "/path/to/dir" +! MSVC-LINKER-OPTIONS-NOT: "/WHOLEARCHIVE:Fortran_main" From c67e2d97ad7fa25e997d378a3fcd1142ad38bf80 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Mon, 25 Dec 2023 10:58:07 -0800 Subject: [PATCH 265/342] [ADT] Add [[nodiscard]] to SmallString (NFC) --- llvm/include/llvm/ADT/SmallString.h | 65 ++++++++++++++--------------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/llvm/include/llvm/ADT/SmallString.h b/llvm/include/llvm/ADT/SmallString.h index 02fa28fc856d7..a5b9eec50c825 100644 --- a/llvm/include/llvm/ADT/SmallString.h +++ b/llvm/include/llvm/ADT/SmallString.h @@ -89,30 +89,26 @@ class SmallString : public SmallVector<char, InternalLen> { /// Check for string equality. This is more efficient than compare() when /// the relative ordering of inequal strings isn't needed. - bool equals(StringRef RHS) const { - return str().equals(RHS); - } + [[nodiscard]] bool equals(StringRef RHS) const { return str().equals(RHS); } /// Check for string equality, ignoring case. - bool equals_insensitive(StringRef RHS) const { + [[nodiscard]] bool equals_insensitive(StringRef RHS) const { return str().equals_insensitive(RHS); } /// compare - Compare two strings; the result is negative, zero, or positive /// if this string is lexicographically less than, equal to, or greater than /// the \p RHS. 
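// Editor's illustrative sketch, not part of the patch: with [[nodiscard]] on
// these query methods, a silently dropped result now triggers -Wunused-result,
// while an explicit cast to void stays silent.
#include "llvm/ADT/SmallString.h"

static void nodiscardDemo(const llvm::SmallString<16> &S) {
  (void)S.equals("abc"); // deliberate discard: no warning
  // S.equals("abc");    // without the cast, this line would now warn
}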
- int compare(StringRef RHS) const { - return str().compare(RHS); - } + [[nodiscard]] int compare(StringRef RHS) const { return str().compare(RHS); } /// compare_insensitive - Compare two strings, ignoring case. - int compare_insensitive(StringRef RHS) const { + [[nodiscard]] int compare_insensitive(StringRef RHS) const { return str().compare_insensitive(RHS); } /// compare_numeric - Compare two strings, treating sequences of digits as /// numbers. - int compare_numeric(StringRef RHS) const { + [[nodiscard]] int compare_numeric(StringRef RHS) const { return str().compare_numeric(RHS); } @@ -121,10 +117,14 @@ class SmallString : public SmallVector<char, InternalLen> { /// @{ /// starts_with - Check if this string starts with the given \p Prefix. - bool starts_with(StringRef Prefix) const { return str().starts_with(Prefix); } + [[nodiscard]] bool starts_with(StringRef Prefix) const { + return str().starts_with(Prefix); + } /// ends_with - Check if this string ends with the given \p Suffix. - bool ends_with(StringRef Suffix) const { return str().ends_with(Suffix); } + [[nodiscard]] bool ends_with(StringRef Suffix) const { + return str().ends_with(Suffix); + } /// @} /// @name String Searching @@ -134,7 +134,7 @@ class SmallString : public SmallVector<char, InternalLen> { /// /// \return - The index of the first occurrence of \p C, or npos if not /// found. - size_t find(char C, size_t From = 0) const { + [[nodiscard]] size_t find(char C, size_t From = 0) const { return str().find(C, From); } @@ -142,7 +142,7 @@ class SmallString : public SmallVector<char, InternalLen> { /// /// \returns The index of the first occurrence of \p Str, or npos if not /// found. - size_t find(StringRef Str, size_t From = 0) const { + [[nodiscard]] size_t find(StringRef Str, size_t From = 0) const { return str().find(Str, From); } @@ -150,7 +150,7 @@ class SmallString : public SmallVector<char, InternalLen> { /// /// \returns The index of the last occurrence of \p C, or npos if not /// found. - size_t rfind(char C, size_t From = StringRef::npos) const { + [[nodiscard]] size_t rfind(char C, size_t From = StringRef::npos) const { return str().rfind(C, From); } @@ -158,13 +158,11 @@ class SmallString : public SmallVector<char, InternalLen> { /// /// \returns The index of the last occurrence of \p Str, or npos if not /// found. - size_t rfind(StringRef Str) const { - return str().rfind(Str); - } + [[nodiscard]] size_t rfind(StringRef Str) const { return str().rfind(Str); } /// Find the first character in the string that is \p C, or npos if not /// found. Same as find. - size_t find_first_of(char C, size_t From = 0) const { + [[nodiscard]] size_t find_first_of(char C, size_t From = 0) const { return str().find_first_of(C, From); } @@ -172,13 +170,13 @@ class SmallString : public SmallVector<char, InternalLen> { /// not found. /// /// Complexity: O(size() + Chars.size()) - size_t find_first_of(StringRef Chars, size_t From = 0) const { + [[nodiscard]] size_t find_first_of(StringRef Chars, size_t From = 0) const { return str().find_first_of(Chars, From); } /// Find the first character in the string that is not \p C or npos if not /// found. - size_t find_first_not_of(char C, size_t From = 0) const { + [[nodiscard]] size_t find_first_not_of(char C, size_t From = 0) const { return str().find_first_not_of(C, From); } @@ -186,13 +184,15 @@ class SmallString : public SmallVector<char, InternalLen> { /// \p Chars, or npos if not found. 
/// /// Complexity: O(size() + Chars.size()) - size_t find_first_not_of(StringRef Chars, size_t From = 0) const { + [[nodiscard]] size_t find_first_not_of(StringRef Chars, + size_t From = 0) const { return str().find_first_not_of(Chars, From); } /// Find the last character in the string that is \p C, or npos if not /// found. - size_t find_last_of(char C, size_t From = StringRef::npos) const { + [[nodiscard]] size_t find_last_of(char C, + size_t From = StringRef::npos) const { return str().find_last_of(C, From); } @@ -200,8 +200,8 @@ class SmallString : public SmallVector<char, InternalLen> { /// found. /// /// Complexity: O(size() + Chars.size()) - size_t find_last_of( - StringRef Chars, size_t From = StringRef::npos) const { + [[nodiscard]] size_t find_last_of(StringRef Chars, + size_t From = StringRef::npos) const { return str().find_last_of(Chars, From); } @@ -210,15 +210,11 @@ class SmallString : public SmallVector<char, InternalLen> { /// @{ /// Return the number of occurrences of \p C in the string. - size_t count(char C) const { - return str().count(C); - } + [[nodiscard]] size_t count(char C) const { return str().count(C); } /// Return the number of non-overlapped occurrences of \p Str in the /// string. - size_t count(StringRef Str) const { - return str().count(Str); - } + [[nodiscard]] size_t count(StringRef Str) const { return str().count(Str); } /// @} /// @name Substring Operations @@ -233,7 +229,8 @@ class SmallString : public SmallVector<char, InternalLen> { /// \param N The number of characters to included in the substring. If \p N /// exceeds the number of characters remaining in the string, the string /// suffix (starting with \p Start) will be returned. - StringRef substr(size_t Start, size_t N = StringRef::npos) const { + [[nodiscard]] StringRef substr(size_t Start, + size_t N = StringRef::npos) const { return str().substr(Start, N); } @@ -247,14 +244,16 @@ class SmallString : public SmallVector<char, InternalLen> { /// substring. If this is npos, or less than \p Start, or exceeds the /// number of characters remaining in the string, the string suffix /// (starting with \p Start) will be returned. - StringRef slice(size_t Start, size_t End) const { + [[nodiscard]] StringRef slice(size_t Start, size_t End) const { return str().slice(Start, End); } // Extra methods. /// Explicit conversion to StringRef. - StringRef str() const { return StringRef(this->data(), this->size()); } + [[nodiscard]] StringRef str() const { + return StringRef(this->data(), this->size()); + } // TODO: Make this const, if it's safe... const char* c_str() { From 8c24422cd4a5ec458950e135f62d9b14a96e75cc Mon Sep 17 00:00:00 2001 From: Da-Viper <57949090+Da-Viper@users.noreply.github.com> Date: Mon, 25 Dec 2023 18:59:08 +0000 Subject: [PATCH 266/342] [clang-tidy] add std::span to the default types. (#76116) Change default configuration of readability-simplify-subscript-expr to include std::span. 
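As a rough illustration (not taken from the patch), this is the kind of expression the check can now simplify once `std::span` is part of the default `Types` list:

```c++
#include <cstddef>
#include <span>

int firstMatch(std::span<const int> S, std::size_t I) {
  // readability-simplify-subscript-expr flags the .data() subscript and
  // suggests subscripting the span directly:
  return S.data()[I]; // fix-it: return S[I];
}
```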
Fixes #75687 --- .../clang-tidy/readability/SimplifySubscriptExprCheck.cpp | 3 ++- .../clang-tidy/checks/readability/simplify-subscript-expr.rst | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp index d274abcbfabe8..7d4698d27ed16 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.cpp @@ -16,7 +16,8 @@ using namespace clang::ast_matchers; namespace clang::tidy::readability { static const char KDefaultTypes[] = - "::std::basic_string;::std::basic_string_view;::std::vector;::std::array"; + "::std::basic_string;::std::basic_string_view;::std::vector;::std::array;::" + "std::span"; SimplifySubscriptExprCheck::SimplifySubscriptExprCheck( StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/simplify-subscript-expr.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/simplify-subscript-expr.rst index f3f44bedcf74c..4b7d7f2ddcf41 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/simplify-subscript-expr.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/simplify-subscript-expr.rst @@ -20,4 +20,4 @@ Options .. option:: Types The list of type(s) that triggers this check. Default is - `::std::basic_string;::std::basic_string_view;::std::vector;::std::array` + `::std::basic_string;::std::basic_string_view;::std::vector;::std::array;::std::span` From 76243adc4699314c7569ba3c0610ecd187228291 Mon Sep 17 00:00:00 2001 From: Jacek Caban <jacek@codeweavers.com> Date: Mon, 25 Dec 2023 20:31:08 +0100 Subject: [PATCH 267/342] [LLD][COFF] Merge .wowthk section to .text. (#76254) .wowthk section is used by the compiler for ARM64EC entry thunks. --- lld/COFF/Driver.cpp | 3 +++ lld/test/COFF/merge-wowthk.s | 44 ++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 lld/test/COFF/merge-wowthk.s diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 4f11affb35ed3..cd2985b035bc0 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1951,6 +1951,9 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) { parseMerge(".00cfg=.rdata"); parseMerge(".bss=.data"); + if (isArm64EC(config->machine)) + parseMerge(".wowthk=.text"); + if (config->mingw) { parseMerge(".ctors=.rdata"); parseMerge(".dtors=.rdata"); diff --git a/lld/test/COFF/merge-wowthk.s b/lld/test/COFF/merge-wowthk.s new file mode 100644 index 0000000000000..a1358fc30e4c5 --- /dev/null +++ b/lld/test/COFF/merge-wowthk.s @@ -0,0 +1,44 @@ +// REQUIRES: aarch64 + +// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %s -o %t-arm64ec.obj +// RUN: llvm-mc -filetype=obj -triple=aarch64-windows %s -o %t-arm64.obj +// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o %t-loadcfg.obj + +// Check that .wowthk section is merged into .text on ARM64EC target. + +// RUN: lld-link -out:%t.dll -machine:arm64ec %t-arm64ec.obj %t-loadcfg.obj -dll -noentry +// RUN: llvm-objdump -d %t.dll | FileCheck -check-prefix=DISASM %s +// DISASM: 0000000180001000 <.text>: +// DISASM-NEXT: 180001000: 52800040 mov w0, #0x2 // =2 +// DISASM-NEXT: 180001004: d65f03c0 ret +// DISASM-NEXT: 180001008: 52800060 mov w0, #0x3 // =3 +// DISASM-NEXT: 18000100c: d65f03c0 ret + +// Check that .wowthk section is not merged on aarch64 target. 
+ +// RUN: lld-link -out:%t.dll -machine:arm64 %t-arm64.obj -dll -noentry +// RUN: llvm-objdump -d %t.dll | FileCheck -check-prefix=DISASM2 %s +// DISASM2: 0000000180001000 <.text>: +// DISASM2-NEXT: 180001000: 52800040 mov w0, #0x2 // =2 +// DISASM2-NEXT: 180001004: d65f03c0 ret +// DISASM2-EMPTY: +// DISASM2-NEXT: Disassembly of section .wowthk: +// DISASM2-EMPTY: +// DISASM2-NEXT: 0000000180002000 <.wowthk>: +// DISASM2-NEXT: 180002000: 52800060 mov w0, #0x3 // =3 +// DISASM2-NEXT: 180002004: d65f03c0 ret + + + .text + .globl arm64ec_func_sym + .p2align 2, 0x0 +arm64ec_func_sym: + mov w0, #2 + ret + + .section .wowthk$aa, "x" + .globl wowthk_sym + .p2align 3, 0x0 +wowthk_sym: + mov w0, #3 + ret From f5f2c313ae3dfe29638a3794f75a8ad3ccbc5f4e Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Mon, 25 Dec 2023 12:32:59 -0800 Subject: [PATCH 268/342] [llvm] Use StringRef::consume_front (NFC) --- llvm/lib/ExecutionEngine/Orc/Debugging/DebugInfoSupport.cpp | 3 +-- llvm/lib/MC/WasmObjectWriter.cpp | 3 +-- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 6 ++---- llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 3 +-- llvm/lib/Target/X86/X86InsertPrefetch.cpp | 3 +-- llvm/lib/TargetParser/ARMTargetParser.cpp | 6 +----- llvm/lib/TargetParser/Triple.cpp | 3 +-- llvm/tools/llvm-ar/llvm-ar.cpp | 3 +-- llvm/tools/llvm-diff/llvm-diff.cpp | 3 +-- 9 files changed, 10 insertions(+), 23 deletions(-) diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebugInfoSupport.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebugInfoSupport.cpp index f65ec27ff8756..5a058bd712a3e 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebugInfoSupport.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebugInfoSupport.cpp @@ -105,8 +105,7 @@ llvm::orc::createDWARFContext(LinkGraph &G) { auto SecData = getSectionData(Sec); auto Name = Sec.getName(); // DWARFContext expects the section name to not start with a dot - if (Name.starts_with(".")) - Name = Name.drop_front(); + Name.consume_front("."); LLVM_DEBUG(dbgs() << "Creating DWARFContext section " << Name << " with size " << SecData.size() << "\n"); DWARFSectionData[Name] = diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index fd48d5080ff61..e43f111113b40 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -1526,8 +1526,7 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm, StringRef Name = SectionName; // For user-defined custom sections, strip the prefix - if (Name.starts_with(".custom_section.")) - Name = Name.substr(strlen(".custom_section.")); + Name.consume_front(".custom_section."); MCSymbol *Begin = Sec.getBeginSymbol(); if (Begin) { diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 3a34a0bfae46e..6c009b9e8ddef 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -959,8 +959,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVTypeByName( // N is the number of elements of the vector. Type *Ty; - if (TypeStr.starts_with("atomic_")) - TypeStr = TypeStr.substr(strlen("atomic_")); + TypeStr.consume_front("atomic_"); if (TypeStr.starts_with("void")) { Ty = Type::getVoidTy(Ctx); @@ -1007,8 +1006,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVTypeByName( // Handle "typeN*" or "type vector[N]*". 
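// Editor's illustrative sketch, not part of the patch: StringRef::consume_front
// tests for a prefix and strips it in place in a single call, returning true
// when the prefix was present -- the idiom that replaces the
// starts_with()/substr() pairs in this change.
#include "llvm/ADT/StringRef.h"

static llvm::StringRef stripAtomicPrefix(llvm::StringRef TypeStr) {
  TypeStr.consume_front("atomic_"); // no-op when the prefix is absent
  return TypeStr;
}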
bool IsPtrToVec = TypeStr.consume_back("*"); - if (TypeStr.starts_with(" vector[")) { - TypeStr = TypeStr.substr(strlen(" vector[")); + if (TypeStr.consume_front(" vector[")) { TypeStr = TypeStr.substr(0, TypeStr.find(']')); } TypeStr.getAsInteger(10, VecElts); diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index bc5f562d95893..5a231a046e931 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2312,8 +2312,7 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, // Drop the optional '.'. StringRef DotDispStr = Tok.getString(); - if (DotDispStr.starts_with(".")) - DotDispStr = DotDispStr.drop_front(1); + DotDispStr.consume_front("."); StringRef TrailingDot; // .Imm gets lexed as a real. diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp index 6c23928228d21..9aa70dff5f932 100644 --- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp +++ b/llvm/lib/Target/X86/X86InsertPrefetch.cpp @@ -135,8 +135,7 @@ bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples, int64_t D = static_cast<int64_t>(S_V.second); unsigned IID = 0; for (const auto &HintType : HintTypes) { - if (Name.starts_with(HintType.first)) { - Name = Name.drop_front(HintType.first.size()); + if (Name.consume_front(HintType.first)) { IID = HintType.second; break; } diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp index 27d168020ce60..ce640f5b8d45c 100644 --- a/llvm/lib/TargetParser/ARMTargetParser.cpp +++ b/llvm/lib/TargetParser/ARMTargetParser.cpp @@ -348,11 +348,7 @@ StringRef ARM::getArchExtName(uint64_t ArchExtKind) { } static bool stripNegationPrefix(StringRef &Name) { - if (Name.starts_with("no")) { - Name = Name.substr(2); - return true; - } - return false; + return Name.consume_front("no"); } StringRef ARM::getArchExtFeature(StringRef ArchExt) { diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index d475650c2d18c..e93502187b549 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -1208,8 +1208,7 @@ static VersionTuple parseVersionFromName(StringRef Name) { VersionTuple Triple::getEnvironmentVersion() const { StringRef EnvironmentName = getEnvironmentName(); StringRef EnvironmentTypeName = getEnvironmentTypeName(getEnvironment()); - if (EnvironmentName.starts_with(EnvironmentTypeName)) - EnvironmentName = EnvironmentName.substr(EnvironmentTypeName.size()); + EnvironmentName.consume_front(EnvironmentTypeName); return parseVersionFromName(EnvironmentName); } diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp index fcb6392a1d955..299b7856ec0ba 100644 --- a/llvm/tools/llvm-ar/llvm-ar.cpp +++ b/llvm/tools/llvm-ar/llvm-ar.cpp @@ -1287,8 +1287,7 @@ static const char *matchFlagWithArg(StringRef Expected, ArrayRef<const char *> Args) { StringRef Arg = *ArgIt; - if (Arg.starts_with("--")) - Arg = Arg.substr(2); + Arg.consume_front("--"); size_t len = Expected.size(); if (Arg == Expected) { diff --git a/llvm/tools/llvm-diff/llvm-diff.cpp b/llvm/tools/llvm-diff/llvm-diff.cpp index 6fe18a51c9f55..3e77b1ed89b04 100644 --- a/llvm/tools/llvm-diff/llvm-diff.cpp +++ b/llvm/tools/llvm-diff/llvm-diff.cpp @@ -42,8 +42,7 @@ static std::unique_ptr<Module> readModule(LLVMContext &Context, static void diffGlobal(DifferenceEngine &Engine, Module &L, Module &R, StringRef Name) { // Drop leading sigils from the 
global name. - if (Name.starts_with("@")) - Name = Name.substr(1); + Name.consume_front("@"); Function *LFn = L.getFunction(Name); Function *RFn = R.getFunction(Name); From 9e98f8d7ac11c63768b1ed69c11ea75c8b794063 Mon Sep 17 00:00:00 2001 From: Piotr Zegar <me@piotrzegar.pl> Date: Mon, 25 Dec 2023 18:57:25 +0000 Subject: [PATCH 269/342] [clng-tidy][NFC] Update documentation for readability-simplify-subscript-expr Add release notes and mention ::std::span in documentation. Change is related to #76116. --- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 6e7554e0433c2..c843efac754ce 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -458,6 +458,10 @@ Changes in existing checks <clang-tidy/checks/readability/non-const-parameter>` check to ignore false-positives in initializer list of record. +- Improved :doc:`readability-simplify-subscript-expr + <clang-tidy/checks/readability/simplify-subscript-expr>` check by extending + the default value of the `Types` option to include ``std::span``. + - Improved :doc:`readability-static-accessed-through-instance <clang-tidy/checks/readability/static-accessed-through-instance>` check to identify calls to static member functions with out-of-class inline definitions. From 68f832f56da1af0e5fc77003f640648ec7d901ad Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Mon, 25 Dec 2023 12:54:35 -0800 Subject: [PATCH 270/342] [clang] Use StringRef::consume_front (NFC) --- clang/lib/CodeGen/CodeGenAction.cpp | 3 +-- clang/lib/Driver/ToolChains/CommonArgs.cpp | 3 +-- clang/lib/Frontend/DependencyGraph.cpp | 3 +-- clang/lib/Frontend/VerifyDiagnosticConsumer.cpp | 3 +-- clang/lib/Sema/SemaChecking.cpp | 4 ++-- clang/lib/Sema/SemaDeclAttr.cpp | 3 +-- .../lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp | 6 ++---- clang/lib/Tooling/Refactoring/Lookup.cpp | 4 ++-- clang/lib/Tooling/Tooling.cpp | 4 +--- 9 files changed, 12 insertions(+), 21 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 753a8fd74fa69..f8038497d90a7 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -1139,8 +1139,7 @@ CodeGenAction::loadModule(MemoryBufferRef MBRef) { // Strip off a leading diagnostic code if there is one. StringRef Msg = Err.getMessage(); - if (Msg.starts_with("error: ")) - Msg = Msg.substr(7); + Msg.consume_front("error: "); unsigned DiagID = CI.getDiagnostics().getCustomDiagID(DiagnosticsEngine::Error, "%0"); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 3b29e1bc75850..2340191ca97d9 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -2388,8 +2388,7 @@ static void GetSDLFromOffloadArchive( FoundAOB = true; } } else { - if (Lib.starts_with("-l")) - Lib = Lib.drop_front(2); + Lib.consume_front("-l"); for (auto LPath : LibraryPaths) { ArchiveOfBundles.clear(); auto LibFile = (Lib.starts_with(":") ? 
Lib.drop_front() diff --git a/clang/lib/Frontend/DependencyGraph.cpp b/clang/lib/Frontend/DependencyGraph.cpp index e96669f856bb1..b471471f3528a 100644 --- a/clang/lib/Frontend/DependencyGraph.cpp +++ b/clang/lib/Frontend/DependencyGraph.cpp @@ -110,8 +110,7 @@ void DependencyGraphCallback::OutputGraphFile() { writeNodeReference(OS, AllFiles[I]); OS << " [ shape=\"box\", label=\""; StringRef FileName = AllFiles[I].getName(); - if (FileName.starts_with(SysRoot)) - FileName = FileName.substr(SysRoot.size()); + FileName.consume_front(SysRoot); OS << DOT::EscapeString(std::string(FileName)) << "\"];\n"; } diff --git a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp index 09c1460d54e1d..8a3d2286cd168 100644 --- a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp +++ b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp @@ -1144,8 +1144,7 @@ std::unique_ptr<Directive> Directive::create(bool RegexKind, std::string RegexStr; StringRef S = Text; while (!S.empty()) { - if (S.starts_with("{{")) { - S = S.drop_front(2); + if (S.consume_front("{{")) { size_t RegexMatchLength = S.find("}}"); assert(RegexMatchLength != StringRef::npos); // Append the regex, enclosed in parentheses. diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 2a69325f02951..66dac99b8d992 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -1219,8 +1219,8 @@ void Sema::checkFortifiedBuiltinMemoryFunction(FunctionDecl *FD, if (IsChkVariant) { FunctionName = FunctionName.drop_front(std::strlen("__builtin___")); FunctionName = FunctionName.drop_back(std::strlen("_chk")); - } else if (FunctionName.starts_with("__builtin_")) { - FunctionName = FunctionName.drop_front(std::strlen("__builtin_")); + } else { + FunctionName.consume_front("__builtin_"); } return FunctionName; }; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index af8b90ecfed97..4a385a396fa62 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5825,8 +5825,7 @@ struct IntrinToName { static bool ArmBuiltinAliasValid(unsigned BuiltinID, StringRef AliasName, ArrayRef<IntrinToName> Map, const char *IntrinNames) { - if (AliasName.starts_with("__arm_")) - AliasName = AliasName.substr(6); + AliasName.consume_front("__arm_"); const IntrinToName *It = llvm::lower_bound(Map, BuiltinID, [](const IntrinToName &L, unsigned Id) { return L.Id < Id; diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp index afc5e6b48008d..ce05d2d3c9058 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CheckSecuritySyntaxOnly.cpp @@ -140,8 +140,7 @@ void WalkAST::VisitCallExpr(CallExpr *CE) { if (!II) // if no identifier, not a simple C function return; StringRef Name = II->getName(); - if (Name.starts_with("__builtin_")) - Name = Name.substr(10); + Name.consume_front("__builtin_"); // Set the evaluation function by switching on the callee name. 
FnCheck evalFunction = @@ -763,8 +762,7 @@ void WalkAST::checkDeprecatedOrUnsafeBufferHandling(const CallExpr *CE, enum { DEPR_ONLY = -1, UNKNOWN_CALL = -2 }; StringRef Name = FD->getIdentifier()->getName(); - if (Name.starts_with("__builtin_")) - Name = Name.substr(10); + Name.consume_front("__builtin_"); int ArgIndex = llvm::StringSwitch<int>(Name) diff --git a/clang/lib/Tooling/Refactoring/Lookup.cpp b/clang/lib/Tooling/Refactoring/Lookup.cpp index 52799f16fab2a..757fba0404e62 100644 --- a/clang/lib/Tooling/Refactoring/Lookup.cpp +++ b/clang/lib/Tooling/Refactoring/Lookup.cpp @@ -98,8 +98,8 @@ static StringRef getBestNamespaceSubstr(const DeclContext *DeclA, // from NewName if it has an identical prefix. std::string NS = "::" + cast<NamespaceDecl>(DeclA)->getQualifiedNameAsString() + "::"; - if (NewName.starts_with(NS)) - return NewName.substr(NS.size()); + if (NewName.consume_front(NS)) + return NewName; // No match yet. Strip of a namespace from the end of the chain and try // again. This allows to get optimal qualifications even if the old and new diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index 33bfa8d3d81f1..d192c7f429396 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -255,9 +255,7 @@ llvm::Expected<std::string> getAbsolutePath(llvm::vfs::FileSystem &FS, StringRef File) { StringRef RelativePath(File); // FIXME: Should '.\\' be accepted on Win32? - if (RelativePath.starts_with("./")) { - RelativePath = RelativePath.substr(strlen("./")); - } + RelativePath.consume_front("./"); SmallString<1024> AbsolutePath = RelativePath; if (auto EC = FS.makeAbsolute(AbsolutePath)) From 5c39b8d1a86cc0c92acd438d4799d19e67ae70db Mon Sep 17 00:00:00 2001 From: XinWang10 <108658776+XinWang10@users.noreply.github.com> Date: Tue, 26 Dec 2023 10:41:33 +0800 Subject: [PATCH 271/342] [X86][MC] Support Enc/Dec for EGPR for promoted AMX-TILE instruction (#76210) R16-R31 was added into GPRs in https://github.com/llvm/llvm-project/pull/70958, This patch supports the encoding/decoding for promoted AMX-TILE instruction in EVEX space. 
RFC: https://discourse.llvm.org/t/rfc-design-for-apx-feature-egpr-and-ndd-support/73031/4 --- llvm/lib/Target/X86/X86InstrAMX.td | 68 +++++++++++-------- .../test/MC/Disassembler/X86/apx/amx-tile.txt | 22 ++++++ llvm/test/MC/X86/apx/amx-tile-att.s | 24 +++++++ llvm/test/MC/X86/apx/amx-tile-intel.s | 21 ++++++ 4 files changed, 106 insertions(+), 29 deletions(-) create mode 100644 llvm/test/MC/Disassembler/X86/apx/amx-tile.txt create mode 100644 llvm/test/MC/X86/apx/amx-tile-att.s create mode 100644 llvm/test/MC/X86/apx/amx-tile-intel.s diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 7f3e193d9a1b9..c47bee070e04f 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -14,35 +14,45 @@ //===----------------------------------------------------------------------===// // AMX instructions -let Predicates = [HasAMXTILE, In64BitMode] in { - let SchedRW = [WriteSystem] in { - let hasSideEffects = 1, - Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in - def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), - "ldtilecfg\t$src", - [(int_x86_ldtilecfg addr:$src)]>, VEX, T8; - let hasSideEffects = 1 in - def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), - "sttilecfg\t$src", - [(int_x86_sttilecfg addr:$src)]>, VEX, T8, PD; - let mayLoad = 1 in - def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src), - "tileloadd\t{$src, $dst|$dst, $src}", []>, - VEX, T8, XD; - let mayLoad = 1 in - def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), - (ins sibmem:$src), - "tileloaddt1\t{$src, $dst|$dst, $src}", []>, - VEX, T8, PD; +multiclass AMX_TILE_COMMON<string Suffix, Predicate HasEGPR> { +let Predicates = [HasAMXTILE, HasEGPR, In64BitMode] in { + let hasSideEffects = 1, + Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in + def LDTILECFG#Suffix : I<0x49, MRM0m, (outs), (ins opaquemem:$src), + "ldtilecfg\t$src", + [(int_x86_ldtilecfg addr:$src)]>, + T8, PS; + let hasSideEffects = 1 in + def STTILECFG#Suffix : I<0x49, MRM0m, (outs), (ins opaquemem:$src), + "sttilecfg\t$src", + [(int_x86_sttilecfg addr:$src)]>, + T8, PD; + let mayLoad = 1 in + def TILELOADD#Suffix : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src), + "tileloadd\t{$src, $dst|$dst, $src}", []>, + T8, XD; + let mayLoad = 1 in + def TILELOADDT1#Suffix : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), + (ins sibmem:$src), + "tileloaddt1\t{$src, $dst|$dst, $src}", []>, + T8, PD; + let mayStore = 1 in + def TILESTORED#Suffix : I<0x4b, MRMDestMemFSIB, (outs), + (ins sibmem:$dst, TILE:$src), + "tilestored\t{$src, $dst|$dst, $src}", []>, + T8, XS; +} +} + +let SchedRW = [WriteSystem] in { + defm "" : AMX_TILE_COMMON<"", NoEGPR>, VEX; + defm "" : AMX_TILE_COMMON<"_EVEX", HasEGPR>, EVEX, NoCD8; + + let Predicates = [HasAMXTILE, In64BitMode] in { let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def TILERELEASE : I<0x49, MRM_C0, (outs), (ins), - "tilerelease", [(int_x86_tilerelease)]>, VEX, T8; - let mayStore = 1 in - def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs), - (ins sibmem:$dst, TILE:$src), - "tilestored\t{$src, $dst|$dst, $src}", []>, - VEX, T8, XS; + "tilerelease", [(int_x86_tilerelease)]>, VEX, T8, PS; def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins), "tilezero\t$dst", []>, VEX, T8, XD; @@ -82,8 +92,8 @@ let Predicates = [HasAMXTILE, In64BitMode] in { def PTILEZERO : PseudoI<(outs), (ins u8imm:$src), [(int_x86_tilezero timm:$src)]>; } - } // SchedRW -} // HasAMXTILE + } // Predicates +} // SchedRW let Predicates 
= [HasAMXINT8, In64BitMode] in { let SchedRW = [WriteSystem] in { diff --git a/llvm/test/MC/Disassembler/X86/apx/amx-tile.txt b/llvm/test/MC/Disassembler/X86/apx/amx-tile.txt new file mode 100644 index 0000000000000..960c40cfc4b15 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/amx-tile.txt @@ -0,0 +1,22 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: ldtilecfg 291(%r28,%r29,4) +# INTEL: ldtilecfg [r28 + 4*r29 + 291] +0x62,0x9a,0x78,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00 + +# ATT: sttilecfg 291(%r28,%r29,4) +# INTEL: sttilecfg [r28 + 4*r29 + 291] +0x62,0x9a,0x79,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00 + +# ATT: tileloadd 291(%r28,%r29,4), %tmm6 +# INTEL: tileloadd tmm6, [r28 + 4*r29 + 291] +0x62,0x9a,0x7b,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: tileloaddt1 291(%r28,%r29,4), %tmm6 +# INTEL: tileloaddt1 tmm6, [r28 + 4*r29 + 291] +0x62,0x9a,0x79,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00 + +# ATT: tilestored %tmm6, 291(%r28,%r29,4) +# INTEL: tilestored [r28 + 4*r29 + 291], tmm6 +0x62,0x9a,0x7a,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/X86/apx/amx-tile-att.s b/llvm/test/MC/X86/apx/amx-tile-att.s new file mode 100644 index 0000000000000..f4a47c16d1939 --- /dev/null +++ b/llvm/test/MC/X86/apx/amx-tile-att.s @@ -0,0 +1,24 @@ +# RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-5: error: +# ERROR-NOT: error: +# CHECK: ldtilecfg 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x9a,0x78,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00] + ldtilecfg 291(%r28,%r29,4) + +# CHECK: sttilecfg 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x9a,0x79,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00] + sttilecfg 291(%r28,%r29,4) + +# CHECK: tileloadd 291(%r28,%r29,4), %tmm6 +# CHECK: encoding: [0x62,0x9a,0x7b,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] + tileloadd 291(%r28,%r29,4), %tmm6 + +# CHECK: tileloaddt1 291(%r28,%r29,4), %tmm6 +# CHECK: encoding: [0x62,0x9a,0x79,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] + tileloaddt1 291(%r28,%r29,4), %tmm6 + +# CHECK: tilestored %tmm6, 291(%r28,%r29,4) +# CHECK: encoding: [0x62,0x9a,0x7a,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] + tilestored %tmm6, 291(%r28,%r29,4) diff --git a/llvm/test/MC/X86/apx/amx-tile-intel.s b/llvm/test/MC/X86/apx/amx-tile-intel.s new file mode 100644 index 0000000000000..dd7b87b1806c2 --- /dev/null +++ b/llvm/test/MC/X86/apx/amx-tile-intel.s @@ -0,0 +1,21 @@ +# RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +# CHECK: ldtilecfg [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x9a,0x78,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00] + ldtilecfg [r28 + 4*r29 + 291] + +# CHECK: sttilecfg [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x9a,0x79,0x08,0x49,0x84,0xac,0x23,0x01,0x00,0x00] + sttilecfg [r28 + 4*r29 + 291] + +# CHECK: tileloadd tmm6, [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x9a,0x7b,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] + tileloadd tmm6, [r28 + 4*r29 + 291] + +# CHECK: tileloaddt1 tmm6, [r28 + 4*r29 + 291] +# CHECK: encoding: [0x62,0x9a,0x79,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] + tileloaddt1 tmm6, [r28 + 4*r29 + 291] + +# CHECK: tilestored [r28 + 4*r29 + 291], tmm6 +# CHECK: encoding: [0x62,0x9a,0x7a,0x08,0x4b,0xb4,0xac,0x23,0x01,0x00,0x00] + tilestored [r28 + 4*r29 + 291], 
tmm6 From f9e23991a7bb99eca338c4c3aae842386ae5190f Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Tue, 26 Dec 2023 12:13:10 +0800 Subject: [PATCH 272/342] [X86][NFC] Remove class ADCOXOp(RR|RM) and redundant let statements 1. Remove these two classes b/c opcode is changed from 0xF6 to 0x66 after promotion, then the classes become useless. 2. Remove `OpSize = OpSizeFixed` b/c the default value is OpSizeFixed. 3. Remove `let isCommutable = 1` b/c ADCX/ADOX is not VEX-encoding, we can not apply VEX3ToVEX2 optimization for it and the compiler never emits it. 4. Remove predicate `HasADX` due to no pattern This patch is to extract the NFC in #76319 into a separate commit. --- llvm/lib/Target/X86/X86InstrArithmetic.td | 34 +++++++++-------------- llvm/lib/Target/X86/X86InstrPredicates.td | 1 - 2 files changed, 13 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 936db48bb9df4..71abd03044c82 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1188,25 +1188,17 @@ let Uses = [RDX] in // // We don't have patterns for these as there is no advantage over ADC for // most code. -class ADCOXOpRR <string m, X86TypeInfo t> - : BinOpRRF_RF<0xF6, m, t, null_frag> { - let Form = MRMSrcReg; - let isCommutable = 1; -} - -class ADCOXOpRM <string m, X86TypeInfo t> - : BinOpRMF_RF<0xF6, m, t, null_frag> { - let Form = MRMSrcMem; -} - -let OpSize = OpSizeFixed, Constraints = "$src1 = $dst", - Predicates = [HasADX] in { -def ADCX32rr : ADCOXOpRR<"adcx", Xi32>, T8, PD; -def ADCX64rr : ADCOXOpRR<"adcx", Xi64>, T8, PD; -def ADOX32rr : ADCOXOpRR<"adox", Xi32>, T8, XS; -def ADOX64rr : ADCOXOpRR<"adox", Xi64>, T8, XS; -def ADCX32rm : ADCOXOpRM<"adcx", Xi32>, T8, PD; -def ADCX64rm : ADCOXOpRM<"adcx", Xi64>, T8, PD; -def ADOX32rm : ADCOXOpRM<"adox", Xi32>, T8, XS; -def ADOX64rm : ADCOXOpRM<"adox", Xi64>, T8, XS; +let Constraints = "$src1 = $dst" in { + let Form = MRMSrcReg in { + def ADCX32rr : BinOpRRF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD; + def ADCX64rr : BinOpRRF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD; + def ADOX32rr : BinOpRRF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS; + def ADOX64rr : BinOpRRF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS; + } + let Form = MRMSrcMem in { + def ADCX32rm : BinOpRMF_RF<0xF6, "adcx", Xi32, null_frag>, T8, PD; + def ADCX64rm : BinOpRMF_RF<0xF6, "adcx", Xi64, null_frag>, T8, PD; + def ADOX32rm : BinOpRMF_RF<0xF6, "adox", Xi32, null_frag>, T8, XS; + def ADOX64rm : BinOpRMF_RF<0xF6, "adox", Xi64, null_frag>, T8, XS; + } } diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index 8653f15d86028..df4bc38aa0b56 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -100,7 +100,6 @@ def HasIFMA : Predicate<"Subtarget->hasIFMA()">; def HasAVXIFMA : Predicate<"Subtarget->hasAVXIFMA()">; def NoVLX_Or_NoIFMA : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasIFMA()">; def HasRTM : Predicate<"Subtarget->hasRTM()">; -def HasADX : Predicate<"Subtarget->hasADX()">; def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasSHA512 : Predicate<"Subtarget->hasSHA512()">; def HasSGX : Predicate<"Subtarget->hasSGX()">; From 64e63888dd8beae7272c0526e4770e31857dd0e7 Mon Sep 17 00:00:00 2001 From: Brandon Wu <brandon.wu@sifive.com> Date: Tue, 26 Dec 2023 12:59:00 +0800 Subject: [PATCH 273/342] Recommit [RISCV] Update the interface of sifive vqmaccqoq 
(#74284) (#75768) The spec(https://sifive.cdn.prismic.io/sifive/60d5a660-3af0-49a3-a904-d2bbb1a21517_int8-matmul-spec.pdf) is updated. --- .../clang/Basic/riscv_sifive_vector.td | 26 ++++--- .../non-overloaded/sf_vqmacc_4x8x4.c | 24 +++---- .../non-overloaded/sf_vqmaccsu_4x8x4.c | 24 +++---- .../non-overloaded/sf_vqmaccu_4x8x4.c | 24 +++---- .../non-overloaded/sf_vqmaccus_4x8x4.c | 24 +++---- .../non-policy/overloaded/sf_vqmacc_4x8x4.c | 24 +++---- .../non-policy/overloaded/sf_vqmaccsu_4x8x4.c | 24 +++---- .../non-policy/overloaded/sf_vqmaccu_4x8x4.c | 24 +++---- .../non-policy/overloaded/sf_vqmaccus_4x8x4.c | 24 +++---- .../policy/non-overloaded/sf_vqmacc_4x8x4.c | 24 +++---- .../policy/non-overloaded/sf_vqmaccsu_4x8x4.c | 24 +++---- .../policy/non-overloaded/sf_vqmaccu_4x8x4.c | 24 +++---- .../policy/non-overloaded/sf_vqmaccus_4x8x4.c | 24 +++---- .../policy/overloaded/sf_vqmacc_4x8x4.c | 24 +++---- .../policy/overloaded/sf_vqmaccsu_4x8x4.c | 24 +++---- .../policy/overloaded/sf_vqmaccu_4x8x4.c | 24 +++---- .../policy/overloaded/sf_vqmaccus_4x8x4.c | 24 +++---- clang/test/Sema/rvv-required-features.c | 18 +---- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 58 +++++++++------ .../test/CodeGen/RISCV/rvv/sf_vqmacc_4x8x4.ll | 70 +++++++++---------- .../CodeGen/RISCV/rvv/sf_vqmaccsu_4x8x4.ll | 70 +++++++++---------- .../CodeGen/RISCV/rvv/sf_vqmaccu_4x8x4.ll | 70 +++++++++---------- .../CodeGen/RISCV/rvv/sf_vqmaccus_4x8x4.ll | 70 +++++++++---------- 23 files changed, 384 insertions(+), 382 deletions(-) diff --git a/clang/include/clang/Basic/riscv_sifive_vector.td b/clang/include/clang/Basic/riscv_sifive_vector.td index bb54e26641861..e19a34f7632fd 100644 --- a/clang/include/clang/Basic/riscv_sifive_vector.td +++ b/clang/include/clang/Basic/riscv_sifive_vector.td @@ -112,7 +112,7 @@ multiclass RVVVFWMACCBuiltinSet<list<list<string>> suffixes_prototypes> { defm NAME : RVVOutOp1Op2BuiltinSet<NAME, "b", suffixes_prototypes>; } -multiclass RVVVQMACCBuiltinSet<list<list<string>> suffixes_prototypes> { +multiclass RVVVQMACCDODBuiltinSet<list<list<string>> suffixes_prototypes> { let OverloadedName = NAME, Name = NAME, HasMasked = false, @@ -120,6 +120,14 @@ multiclass RVVVQMACCBuiltinSet<list<list<string>> suffixes_prototypes> { defm NAME : RVVOutOp1Op2BuiltinSet<NAME, "i", suffixes_prototypes>; } +multiclass RVVVQMACCQOQBuiltinSet<list<list<string>> suffixes_prototypes> { + let OverloadedName = NAME, + Name = NAME, + HasMasked = false, + Log2LMUL = [-1, 0, 1, 2] in + defm NAME : RVVOutOp1Op2BuiltinSet<NAME, "s", suffixes_prototypes>; +} + multiclass RVVVFNRCLIPBuiltinSet<string suffix, string prototype, string type_range> { let Log2LMUL = [-3, -2, -1, 0, 1, 2], Name = NAME, @@ -130,18 +138,18 @@ multiclass RVVVFNRCLIPBuiltinSet<string suffix, string prototype, string type_ra let UnMaskedPolicyScheme = HasPolicyOperand in let RequiredFeatures = ["Xsfvqmaccdod"] in { - defm sf_vqmaccu_2x8x2 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)SUv(FixedSEW:8)Uv"]]>; - defm sf_vqmacc_2x8x2 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)v"]]>; - defm sf_vqmaccus_2x8x2 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)SUv(FixedSEW:8)v"]]>; - defm sf_vqmaccsu_2x8x2 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>; + defm sf_vqmaccu_2x8x2 : RVVVQMACCDODBuiltinSet<[["", "v", "vv(FixedSEW:8)SUv(FixedSEW:8)Uv"]]>; + defm sf_vqmacc_2x8x2 : RVVVQMACCDODBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)v"]]>; + defm sf_vqmaccus_2x8x2 : RVVVQMACCDODBuiltinSet<[["", "v", 
"vv(FixedSEW:8)SUv(FixedSEW:8)v"]]>; + defm sf_vqmaccsu_2x8x2 : RVVVQMACCDODBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>; } let UnMaskedPolicyScheme = HasPolicyOperand in let RequiredFeatures = ["Xsfvqmaccqoq"] in { - defm sf_vqmaccu_4x8x4 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)SUv(FixedSEW:8)Uv"]]>; - defm sf_vqmacc_4x8x4 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)v"]]>; - defm sf_vqmaccus_4x8x4 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)SUv(FixedSEW:8)v"]]>; - defm sf_vqmaccsu_4x8x4 : RVVVQMACCBuiltinSet<[["", "v", "vv(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>; + defm sf_vqmaccu_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)SUv(FixedSEW:8)Uv"]]>; + defm sf_vqmacc_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)Sv(FixedSEW:8)v"]]>; + defm sf_vqmaccus_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)SUv(FixedSEW:8)v"]]>; + defm sf_vqmaccsu_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>; } let UnMaskedPolicyScheme = HasPolicyOperand in diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmacc_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmacc_4x8x4.c index 935cb2e007d35..80e1c443eb54b 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmacc_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmacc_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmacc_4x8x4_i32m1 -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmacc_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmacc_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_i32m1(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmacc_4x8x4_i32m2 -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 
[[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmacc_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmacc_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_i32m2(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmacc_4x8x4_i32m4 -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmacc_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmacc_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_i32m4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmacc_4x8x4_i32m8 -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmacc_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmacc_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_i32m8(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccsu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccsu_4x8x4.c index f34517b24bcf2..8c0a6218c1d2f 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccsu_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccsu_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccsu_4x8x4_i32m1 -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vuint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4_i32m1(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccsu_4x8x4_i32m2 -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4_i32m2(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccsu_4x8x4_i32m4 -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4_i32m4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccsu_4x8x4_i32m8 -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: 
entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vuint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccsu_4x8x4_i32m8(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccu_4x8x4.c
index ab7f6627ad1fb..b40891f417f2c 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccu_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccu_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccu_4x8x4_i32m1
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]]
//
-vint32m1_t test_sf_vqmaccu_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
+vint32m1_t test_sf_vqmaccu_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vuint8mf2_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m1(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccu_4x8x4_i32m2
-// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]]
//
-vint32m2_t test_sf_vqmaccu_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
+vint32m2_t test_sf_vqmaccu_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m2(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccu_4x8x4_i32m4
-// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]]
//
-vint32m4_t test_sf_vqmaccu_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
+vint32m4_t test_sf_vqmaccu_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m4(vd, vs1, vs2, vl);
}
// CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccu_4x8x4_i32m8
-// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
+// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3)
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3)
// CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]]
//
-vint32m8_t test_sf_vqmaccu_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vuint8m8_t vs2, size_t vl) {
+vint32m8_t test_sf_vqmaccu_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) {
return __riscv_sf_vqmaccu_4x8x4_i32m8(vd, vs1, vs2, vl);
}
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccus_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccus_4x8x4.c
index d0bcdcbf40cc3..d106aab64c514 100644
--- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccus_4x8x4.c
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/sf_vqmaccus_4x8x4.c
@@ -7,41 +7,41 @@
#include <sifive_vector.h>
// CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccus_4x8x4_i32m1
-// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-RV64-NEXT: entry:
-// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64
[[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmaccus_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmaccus_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_i32m1(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccus_4x8x4_i32m2 -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmaccus_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmaccus_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_i32m2(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccus_4x8x4_i32m4 -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmaccus_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmaccus_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_i32m4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccus_4x8x4_i32m8 -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmaccus_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmaccus_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_i32m8(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmacc_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmacc_4x8x4.c index 839d09c4f9a98..88fae73069440 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmacc_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmacc_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmacc_4x8x4_i32m1 -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmacc_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmacc_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmacc_4x8x4_i32m2 -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmacc_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmacc_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmacc_4x8x4_i32m4 -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmacc_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmacc_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmacc_4x8x4_i32m8 -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmacc_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmacc_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccsu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccsu_4x8x4.c index b18853043e924..0aec4bfd9fe22 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccsu_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccsu_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccsu_4x8x4_i32m1 -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1(vint32m1_t vd, 
vint8m1_t vs1, vuint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1(vint32m1_t vd, vint8m1_t vs1, vuint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccsu_4x8x4_i32m2 -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2(vint32m2_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccsu_4x8x4_i32m4 -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4(vint32m4_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccsu_4x8x4_i32m8 -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vuint8m8_t vs2, size_t vl) { 
+vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8(vint32m8_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccu_4x8x4.c index 4cb966b08f237..81965e86f77c8 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccu_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccu_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccu_4x8x4_i32m1 -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmaccu_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmaccu_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vuint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccu_4x8x4_i32m2 -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmaccu_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmaccu_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccu_4x8x4_i32m4 -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x 
i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmaccu_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmaccu_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccu_4x8x4_i32m8 -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmaccu_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vuint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmaccu_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccus_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccus_4x8x4.c index f558151f88a3f..f2544cf3ef2ae 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccus_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/sf_vqmaccus_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccus_4x8x4_i32m1 -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmaccus_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmaccus_4x8x4_i32m1(vint32m1_t vd, vuint8m1_t vs1, vint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccus_4x8x4_i32m2 -// CHECK-RV64-SAME: (<vscale x 4 x 
i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmaccus_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmaccus_4x8x4_i32m2(vint32m2_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccus_4x8x4_i32m4 -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmaccus_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmaccus_4x8x4_i32m4(vint32m4_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccus_4x8x4_i32m8 -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 3) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmaccus_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmaccus_4x8x4_i32m8(vint32m8_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmacc_4x8x4.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmacc_4x8x4.c index 05c10840cabfa..8fdeac62a31f1 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmacc_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmacc_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmacc_4x8x4_i32m1_tu -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmacc_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmacc_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_i32m1_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmacc_4x8x4_i32m2_tu -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmacc_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmacc_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_i32m2_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmacc_4x8x4_i32m4_tu -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> 
[[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmacc_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmacc_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_i32m4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmacc_4x8x4_i32m8_tu -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmacc_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmacc_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_i32m8_tu(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccsu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccsu_4x8x4.c index bce1a4e9443fd..e02c790dfbeb7 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccsu_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccsu_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccsu_4x8x4_i32m1_tu -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vuint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4_i32m1_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccsu_4x8x4_i32m2_tu -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x 
i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4_i32m2_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccsu_4x8x4_i32m4_tu -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4_i32m4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccsu_4x8x4_i32m8_tu -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vuint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4_i32m8_tu(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccu_4x8x4.c index 36aaae9caebf6..ddeb6de007164 100644 --- 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccu_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccu_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccu_4x8x4_i32m1_tu -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmaccu_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmaccu_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vuint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4_i32m1_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccu_4x8x4_i32m2_tu -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmaccu_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmaccu_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4_i32m2_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccu_4x8x4_i32m4_tu -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t 
test_sf_vqmaccu_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmaccu_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4_i32m4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccu_4x8x4_i32m8_tu -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmaccu_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vuint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmaccu_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4_i32m8_tu(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccus_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccus_4x8x4.c index f5ac2bf0f1f3a..397e406c2ee58 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccus_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/sf_vqmaccus_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccus_4x8x4_i32m1_tu -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmaccus_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmaccus_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_i32m1_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccus_4x8x4_i32m2_tu -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: 
entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmaccus_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmaccus_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_i32m2_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccus_4x8x4_i32m4_tu -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmaccus_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmaccus_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_i32m4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccus_4x8x4_i32m8_tu -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmaccus_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmaccus_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_i32m8_tu(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmacc_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmacc_4x8x4.c index 531bc2b2b9425..7b3b25a203315 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmacc_4x8x4.c +++ 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmacc_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmacc_4x8x4_i32m1_tu -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmacc_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmacc_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmacc_4x8x4_i32m2_tu -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmacc_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmacc_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmacc_4x8x4_i32m4_tu -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmacc_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmacc_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, 
vint8m2_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmacc_4x8x4_i32m8_tu -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmacc_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmacc_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vint8m4_t vs2, size_t vl) { return __riscv_sf_vqmacc_4x8x4_tu(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccsu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccsu_4x8x4.c index 23bba523aaa44..2f3cbeec26fc9 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccsu_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccsu_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccsu_4x8x4_i32m1_tu -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmaccsu_4x8x4_i32m1_tu(vint32m1_t vd, vint8m1_t vs1, vuint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccsu_4x8x4_i32m2_tu -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 
[[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmaccsu_4x8x4_i32m2_tu(vint32m2_t vd, vint8m1_t vs1, vuint8m1_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccsu_4x8x4_i32m4_tu -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmaccsu_4x8x4_i32m4_tu(vint32m4_t vd, vint8m1_t vs1, vuint8m2_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccsu_4x8x4_i32m8_tu -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vuint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmaccsu_4x8x4_i32m8_tu(vint32m8_t vd, vint8m1_t vs1, vuint8m4_t vs2, size_t vl) { return __riscv_sf_vqmaccsu_4x8x4_tu(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccu_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccu_4x8x4.c index 950688c6c7851..1f2b2a1c86451 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccu_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccu_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccu_4x8x4_i32m1_tu -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> 
[[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmaccu_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmaccu_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vuint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccu_4x8x4_i32m2_tu -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vint32m2_t test_sf_vqmaccu_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmaccu_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vuint8m1_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccu_4x8x4_i32m4_tu -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmaccu_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmaccu_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vuint8m2_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccu_4x8x4_i32m8_tu -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x 
i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmaccu_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vuint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmaccu_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vuint8m4_t vs2, size_t vl) { return __riscv_sf_vqmaccu_4x8x4_tu(vd, vs1, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccus_4x8x4.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccus_4x8x4.c index 7bdce95043ee4..923234fe8e2b6 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccus_4x8x4.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/sf_vqmaccus_4x8x4.c @@ -7,41 +7,41 @@ #include <sifive_vector.h> // CHECK-RV64-LABEL: define dso_local <vscale x 2 x i32> @test_sf_vqmaccus_4x8x4_i32m1_tu -// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-SAME: (<vscale x 2 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv4i8.i64(<vscale x 2 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 4 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vint32m1_t test_sf_vqmaccus_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) { +vint32m1_t test_sf_vqmaccus_4x8x4_i32m1_tu(vint32m1_t vd, vuint8m1_t vs1, vint8mf2_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 4 x i32> @test_sf_vqmaccus_4x8x4_i32m2_tu -// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 4 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv8i8.i64(<vscale x 4 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 8 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 4 x 
i32> [[TMP0]] // -vint32m2_t test_sf_vqmaccus_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) { +vint32m2_t test_sf_vqmaccus_4x8x4_i32m2_tu(vint32m2_t vd, vuint8m1_t vs1, vint8m1_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 8 x i32> @test_sf_vqmaccus_4x8x4_i32m4_tu -// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 8 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv16i8.i64(<vscale x 8 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 16 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vint32m4_t test_sf_vqmaccus_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) { +vint32m4_t test_sf_vqmaccus_4x8x4_i32m4_tu(vint32m4_t vd, vuint8m1_t vs1, vint8m2_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_tu(vd, vs1, vs2, vl); } // CHECK-RV64-LABEL: define dso_local <vscale x 16 x i32> @test_sf_vqmaccus_4x8x4_i32m8_tu -// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-SAME: (<vscale x 16 x i32> [[VD:%.*]], <vscale x 8 x i8> [[VS1:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { // CHECK-RV64-NEXT: entry: -// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 64 x i8> [[VS2]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv32i8.i64(<vscale x 16 x i32> [[VD]], <vscale x 8 x i8> [[VS1]], <vscale x 32 x i8> [[VS2]], i64 [[VL]], i64 2) // CHECK-RV64-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vint32m8_t test_sf_vqmaccus_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vint8m8_t vs2, size_t vl) { +vint32m8_t test_sf_vqmaccus_4x8x4_i32m8_tu(vint32m8_t vd, vuint8m1_t vs1, vint8m4_t vs2, size_t vl) { return __riscv_sf_vqmaccus_4x8x4_tu(vd, vs1, vs2, vl); } diff --git a/clang/test/Sema/rvv-required-features.c b/clang/test/Sema/rvv-required-features.c index 2714ef04b9bf2..5846f338aa801 100644 --- a/clang/test/Sema/rvv-required-features.c +++ b/clang/test/Sema/rvv-required-features.c @@ -1,8 +1,6 @@ // REQUIRES: riscv-registered-target // RUN: %clang_cc1 -triple riscv64 -target-feature +v -target-feature +xsfvcp \ -// RUN: -target-feature +xsfvqmaccdod -target-feature +xsfvqmaccqoq \ -// RUN: -target-feature +experimental-zvfbfmin -target-feature +xsfvfwmaccqqq \ -// RUN: -target-feature +xsfvfnrclipxfqf %s -fsyntax-only -verify +// RUN: -target-feature +xsfvqmaccdod -target-feature +xsfvqmaccqoq %s -fsyntax-only -verify // expected-no-diagnostics @@ -25,18 +23,6 @@ void test_xsfvqmaccdod(vint32m1_t vd, vint8m1_t vs1, vint8m1_t vs2, size_t vl) { __riscv_sf_vqmacc_2x8x2(vd, vs1, vs2, vl); } -void test_xsfvqmaccqoq(vint32m1_t vd, vint8m1_t vs1, vint8m1_t 
vs2, size_t vl) { +void test_xsfvqmaccqoq(vint32m1_t vd, vint8m1_t vs1, vint8mf2_t vs2, size_t vl) { __riscv_sf_vqmacc_4x8x4(vd, vs1, vs2, vl); } - -void test_xsfvfwmaccqqq(vfloat32m1_t vd, vbfloat16m1_t vs1, vbfloat16mf2_t vs2, size_t vl) { - __riscv_sf_vfwmacc_4x4x4(vd, vs1, vs2, vl); -} - -void test_xsfvfnrclipxufqf(vfloat32m1_t vs1, float rs2, size_t vl) { - __riscv_sf_vfnrclip_xu_f_qf(vs1, rs2, vl); -} - -void test_xsfvfnrclipxfqf(vfloat32m1_t vs1, float rs2, size_t vl) { - __riscv_sf_vfnrclip_x_f_qf(vs1, rs2, vl); -} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 0b1d5b664df97..1b63ee7ac4bbd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -349,14 +349,21 @@ multiclass VPseudoSiFiveVMACC<string mx, VReg vd_type, VReg vs2_type, : VPseudoTernaryNoMaskWithPolicy<vd_type, V_M1.vrclass, vs2_type, Constraint>; } -multiclass VPseudoSiFiveVQMACC<string Constraint = ""> { +multiclass VPseudoSiFiveVQMACCDOD<string Constraint = ""> { foreach m = MxListVF8 in let VLMul = m.value in defm NAME : VPseudoSiFiveVMACC<m.MX, m.vrclass, m.vrclass, Constraint>; } +multiclass VPseudoSiFiveVQMACCQOQ<string Constraint = ""> { + foreach i = 0-3 in + let VLMul = MxListVF4[i].value in + defm NAME : VPseudoSiFiveVMACC<MxListVF4[i].MX, MxListVF8[i].vrclass, + MxListVF4[i].vrclass, Constraint>; +} + multiclass VPseudoSiFiveVFWMACC<string Constraint = ""> { - foreach m = MxListFW in + foreach m = MxListVF2 in let VLMul = m.value in defm NAME : VPseudoSiFiveVMACC<m.MX, m.wvrclass, m.vrclass, Constraint>; } @@ -400,17 +407,17 @@ let Predicates = [HasVendorXSfvcp] in { } let Predicates = [HasVendorXSfvqmaccdod] in { - defm VQMACCU_2x8x2 : VPseudoSiFiveVQMACC; - defm VQMACC_2x8x2 : VPseudoSiFiveVQMACC; - defm VQMACCUS_2x8x2 : VPseudoSiFiveVQMACC; - defm VQMACCSU_2x8x2 : VPseudoSiFiveVQMACC; + defm VQMACCU_2x8x2 : VPseudoSiFiveVQMACCDOD; + defm VQMACC_2x8x2 : VPseudoSiFiveVQMACCDOD; + defm VQMACCUS_2x8x2 : VPseudoSiFiveVQMACCDOD; + defm VQMACCSU_2x8x2 : VPseudoSiFiveVQMACCDOD; } let Predicates = [HasVendorXSfvqmaccqoq] in { - defm VQMACCU_4x8x4 : VPseudoSiFiveVQMACC; - defm VQMACC_4x8x4 : VPseudoSiFiveVQMACC; - defm VQMACCUS_4x8x4 : VPseudoSiFiveVQMACC; - defm VQMACCSU_4x8x4 : VPseudoSiFiveVQMACC; + defm VQMACCU_4x8x4 : VPseudoSiFiveVQMACCQOQ; + defm VQMACC_4x8x4 : VPseudoSiFiveVQMACCQOQ; + defm VQMACCUS_4x8x4 : VPseudoSiFiveVQMACCQOQ; + defm VQMACCSU_4x8x4 : VPseudoSiFiveVQMACCQOQ; } let Predicates = [HasVendorXSfvfwmaccqqq] in { @@ -566,16 +573,25 @@ multiclass VPatVMACC<string intrinsic, string instruction, string kind, } } -defset list<VTypeInfoToWide> VQMACCInfoPairs = { +defset list<VTypeInfoToWide> VQMACCDODInfoPairs = { def : VTypeInfoToWide<VI8M1, VI32M1>; def : VTypeInfoToWide<VI8M2, VI32M2>; def : VTypeInfoToWide<VI8M4, VI32M4>; def : VTypeInfoToWide<VI8M8, VI32M8>; } -multiclass VPatVQMACC<string intrinsic, string instruction, string kind> - : VPatVMACC<intrinsic, instruction, kind, VQMACCInfoPairs, vint8m1_t>; +defset list<VTypeInfoToWide> VQMACCQOQInfoPairs = { + def : VTypeInfoToWide<VI8MF2, VI32M1>; + def : VTypeInfoToWide<VI8M1, VI32M2>; + def : VTypeInfoToWide<VI8M2, VI32M4>; + def : VTypeInfoToWide<VI8M4, VI32M8>; +} + +multiclass VPatVQMACCDOD<string intrinsic, string instruction, string kind> + : VPatVMACC<intrinsic, instruction, kind, VQMACCDODInfoPairs, vint8m1_t>; +multiclass VPatVQMACCQOQ<string intrinsic, string instruction, string kind> + : VPatVMACC<intrinsic, instruction, 
kind, VQMACCQOQInfoPairs, vint8m1_t>; multiclass VPatVFWMACC<string intrinsic, string instruction, string kind> : VPatVMACC<intrinsic, instruction, kind, AllWidenableBFloatToFloatVectors, @@ -637,17 +653,17 @@ let Predicates = [HasVendorXSfvcp] in { } let Predicates = [HasVendorXSfvqmaccdod] in { - defm : VPatVQMACC<"vqmaccu_2x8x2", "VQMACCU", "2x8x2">; - defm : VPatVQMACC<"vqmacc_2x8x2", "VQMACC", "2x8x2">; - defm : VPatVQMACC<"vqmaccus_2x8x2", "VQMACCUS", "2x8x2">; - defm : VPatVQMACC<"vqmaccsu_2x8x2", "VQMACCSU", "2x8x2">; + defm : VPatVQMACCDOD<"vqmaccu_2x8x2", "VQMACCU", "2x8x2">; + defm : VPatVQMACCDOD<"vqmacc_2x8x2", "VQMACC", "2x8x2">; + defm : VPatVQMACCDOD<"vqmaccus_2x8x2", "VQMACCUS", "2x8x2">; + defm : VPatVQMACCDOD<"vqmaccsu_2x8x2", "VQMACCSU", "2x8x2">; } let Predicates = [HasVendorXSfvqmaccqoq] in { - defm : VPatVQMACC<"vqmaccu_4x8x4", "VQMACCU", "4x8x4">; - defm : VPatVQMACC<"vqmacc_4x8x4", "VQMACC", "4x8x4">; - defm : VPatVQMACC<"vqmaccus_4x8x4", "VQMACCUS", "4x8x4">; - defm : VPatVQMACC<"vqmaccsu_4x8x4", "VQMACCSU", "4x8x4">; + defm : VPatVQMACCQOQ<"vqmaccu_4x8x4", "VQMACCU", "4x8x4">; + defm : VPatVQMACCQOQ<"vqmacc_4x8x4", "VQMACC", "4x8x4">; + defm : VPatVQMACCQOQ<"vqmaccus_4x8x4", "VQMACCUS", "4x8x4">; + defm : VPatVQMACCQOQ<"vqmaccsu_4x8x4", "VQMACCSU", "4x8x4">; } let Predicates = [HasVendorXSfvfwmaccqqq] in { diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vqmacc_4x8x4.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vqmacc_4x8x4.ll index 2d591be2adc21..eebc51619480b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sf_vqmacc_4x8x4.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sf_vqmacc_4x8x4.ll @@ -7,36 +7,36 @@ declare <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32>, <vscale x 8 x i8>, - <vscale x 8 x i8>, + <vscale x 4 x i8>, iXLen, iXLen); -define <vscale x 2 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +define <vscale x 2 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmacc_4x8x4_tu_i32m1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma ; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v9, v10 ; CHECK-NEXT: ret entry: %a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 8 x i8> %2, + <vscale x 4 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 2 x i32> %a } -define <vscale x 2 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +define <vscale x 2 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmacc_4x8x4_ta_i32m1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v9, v10 ; CHECK-NEXT: ret entry: %a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 8 x i8> %2, + <vscale x 4 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 2 x i32> %a @@ -45,36 +45,36 @@ entry: declare <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32>, <vscale x 8 x i8>, - <vscale x 16 x i8>, + <vscale x 8 x i8>, iXLen, iXLen); -define <vscale x 4 x i32> 
@intrinsic_vqmacc_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +define <vscale x 4 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmacc_4x8x4_tu_i32m2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma -; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v10, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma +; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v10, v11 ; CHECK-NEXT: ret entry: %a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 16 x i8> %2, + <vscale x 8 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 4 x i32> %a } -define <vscale x 4 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +define <vscale x 4 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmacc_4x8x4_ta_i32m2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v10, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v10, v11 ; CHECK-NEXT: ret entry: %a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 16 x i8> %2, + <vscale x 8 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 4 x i32> %a @@ -83,36 +83,36 @@ entry: declare <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32>, <vscale x 8 x i8>, - <vscale x 32 x i8>, + <vscale x 16 x i8>, iXLen, iXLen); -define <vscale x 8 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +define <vscale x 8 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmacc_4x8x4_tu_i32m4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma -; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v12, v16 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v12, v14 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 32 x i8> %2, + <vscale x 16 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 8 x i32> %a } -define <vscale x 8 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +define <vscale x 8 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmacc_4x8x4_ta_i32m4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v12, v16 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v12, v14 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 32 x i8> %2, + <vscale x 16 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 8 x i32> %a @@ -121,38 +121,36 @@ entry: declare <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32>, <vscale x 8 x i8>, - <vscale x 64 
x i8>, + <vscale x 32 x i8>, iXLen, iXLen); -define <vscale x 16 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +define <vscale x 16 x i32> @intrinsic_vqmacc_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmacc_4x8x4_tu_i32m8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vl8r.v v24, (a0) -; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma -; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma +; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v16, v20 ; CHECK-NEXT: ret entry: %a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 64 x i8> %2, + <vscale x 32 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 16 x i32> %a } -define <vscale x 16 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +define <vscale x 16 x i32> @intrinsic_vqmacc_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmacc_4x8x4_ta_i32m8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vl8r.v v24, (a0) -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: sf.vqmacc.4x8x4 v8, v16, v20 ; CHECK-NEXT: ret entry: %a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmacc.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 64 x i8> %2, + <vscale x 32 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 16 x i32> %a diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccsu_4x8x4.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccsu_4x8x4.ll index bfdab33965c13..0d7052356e558 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccsu_4x8x4.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccsu_4x8x4.ll @@ -7,36 +7,36 @@ declare <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32>, <vscale x 8 x i8>, - <vscale x 8 x i8>, + <vscale x 4 x i8>, iXLen, iXLen); -define <vscale x 2 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +define <vscale x 2 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_tu_i32m1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma ; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v9, v10 ; CHECK-NEXT: ret entry: %a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 8 x i8> %2, + <vscale x 4 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 2 x i32> %a } -define <vscale x 2 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +define <vscale x 2 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_ta_i32m1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v9, v10 ; CHECK-NEXT: ret entry: %a = call <vscale x 2 x i32> 
@llvm.riscv.sf.vqmaccsu.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 8 x i8> %2, + <vscale x 4 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 2 x i32> %a @@ -45,36 +45,36 @@ entry: declare <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32>, <vscale x 8 x i8>, - <vscale x 16 x i8>, + <vscale x 8 x i8>, iXLen, iXLen); -define <vscale x 4 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +define <vscale x 4 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_tu_i32m2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma -; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v10, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma +; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v10, v11 ; CHECK-NEXT: ret entry: %a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 16 x i8> %2, + <vscale x 8 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 4 x i32> %a } -define <vscale x 4 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +define <vscale x 4 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_ta_i32m2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v10, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v10, v11 ; CHECK-NEXT: ret entry: %a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 16 x i8> %2, + <vscale x 8 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 4 x i32> %a @@ -83,36 +83,36 @@ entry: declare <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32>, <vscale x 8 x i8>, - <vscale x 32 x i8>, + <vscale x 16 x i8>, iXLen, iXLen); -define <vscale x 8 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +define <vscale x 8 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_tu_i32m4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma -; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v12, v16 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v12, v14 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 32 x i8> %2, + <vscale x 16 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 8 x i32> %a } -define <vscale x 8 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +define <vscale x 8 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_ta_i32m4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v12, v16 +; CHECK-NEXT: vsetvli zero, a0, 
e8, m2, ta, ma +; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v12, v14 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 32 x i8> %2, + <vscale x 16 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 8 x i32> %a @@ -121,38 +121,36 @@ entry: declare <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32>, <vscale x 8 x i8>, - <vscale x 64 x i8>, + <vscale x 32 x i8>, iXLen, iXLen); -define <vscale x 16 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +define <vscale x 16 x i32> @intrinsic_vqmaccsu_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_tu_i32m8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vl8r.v v24, (a0) -; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma -; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma +; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v16, v20 ; CHECK-NEXT: ret entry: %a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 64 x i8> %2, + <vscale x 32 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 16 x i32> %a } -define <vscale x 16 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +define <vscale x 16 x i32> @intrinsic_vqmaccsu_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccsu_4x8x4_ta_i32m8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vl8r.v v24, (a0) -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: sf.vqmaccsu.4x8x4 v8, v16, v20 ; CHECK-NEXT: ret entry: %a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccsu.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 64 x i8> %2, + <vscale x 32 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 16 x i32> %a diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccu_4x8x4.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccu_4x8x4.ll index d1565fb9a634f..3332390f71e01 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccu_4x8x4.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccu_4x8x4.ll @@ -7,36 +7,36 @@ declare <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32>, <vscale x 8 x i8>, - <vscale x 8 x i8>, + <vscale x 4 x i8>, iXLen, iXLen); -define <vscale x 2 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +define <vscale x 2 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_tu_i32m1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma ; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v9, v10 ; CHECK-NEXT: ret entry: %a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 8 x i8> %2, + <vscale x 4 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 2 x i32> %a } -define <vscale x 2 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, 
<vscale x 8 x i8> %2, iXLen %3) nounwind { +define <vscale x 2 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_ta_i32m1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v9, v10 ; CHECK-NEXT: ret entry: %a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 8 x i8> %2, + <vscale x 4 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 2 x i32> %a @@ -45,36 +45,36 @@ entry: declare <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32>, <vscale x 8 x i8>, - <vscale x 16 x i8>, + <vscale x 8 x i8>, iXLen, iXLen); -define <vscale x 4 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +define <vscale x 4 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_tu_i32m2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma -; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v10, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma +; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v10, v11 ; CHECK-NEXT: ret entry: %a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 16 x i8> %2, + <vscale x 8 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 4 x i32> %a } -define <vscale x 4 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +define <vscale x 4 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_ta_i32m2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v10, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v10, v11 ; CHECK-NEXT: ret entry: %a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 16 x i8> %2, + <vscale x 8 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 4 x i32> %a @@ -83,36 +83,36 @@ entry: declare <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32>, <vscale x 8 x i8>, - <vscale x 32 x i8>, + <vscale x 16 x i8>, iXLen, iXLen); -define <vscale x 8 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +define <vscale x 8 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_tu_i32m4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma -; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v12, v16 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v12, v14 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 32 x i8> %2, + <vscale x 16 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 8 x i32> %a } -define <vscale x 8 x i32> 
@intrinsic_vqmaccu_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +define <vscale x 8 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_ta_i32m4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v12, v16 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v12, v14 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 32 x i8> %2, + <vscale x 16 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 8 x i32> %a @@ -121,38 +121,36 @@ entry: declare <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32>, <vscale x 8 x i8>, - <vscale x 64 x i8>, + <vscale x 32 x i8>, iXLen, iXLen); -define <vscale x 16 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +define <vscale x 16 x i32> @intrinsic_vqmaccu_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_tu_i32m8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vl8r.v v24, (a0) -; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma -; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma +; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v16, v20 ; CHECK-NEXT: ret entry: %a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 64 x i8> %2, + <vscale x 32 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 16 x i32> %a } -define <vscale x 16 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +define <vscale x 16 x i32> @intrinsic_vqmaccu_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccu_4x8x4_ta_i32m8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vl8r.v v24, (a0) -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: sf.vqmaccu.4x8x4 v8, v16, v20 ; CHECK-NEXT: ret entry: %a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccu.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 64 x i8> %2, + <vscale x 32 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 16 x i32> %a diff --git a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccus_4x8x4.ll b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccus_4x8x4.ll index c6d2a048c5cbc..74fb66f5bf351 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccus_4x8x4.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sf_vqmaccus_4x8x4.ll @@ -7,36 +7,36 @@ declare <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32>, <vscale x 8 x i8>, - <vscale x 8 x i8>, + <vscale x 4 x i8>, iXLen, iXLen); -define <vscale x 2 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +define <vscale x 2 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_tu_i32m1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli 
zero, a0, e8, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma ; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v9, v10 ; CHECK-NEXT: ret entry: %a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 8 x i8> %2, + <vscale x 4 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 2 x i32> %a } -define <vscale x 2 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +define <vscale x 2 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m1(<vscale x 2 x i32> %0, <vscale x 8 x i8> %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_ta_i32m1: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v9, v10 ; CHECK-NEXT: ret entry: %a = call <vscale x 2 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv2i32.nxv8i8.nxv8i8( <vscale x 2 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 8 x i8> %2, + <vscale x 4 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 2 x i32> %a @@ -45,36 +45,36 @@ entry: declare <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32>, <vscale x 8 x i8>, - <vscale x 16 x i8>, + <vscale x 8 x i8>, iXLen, iXLen); -define <vscale x 4 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +define <vscale x 4 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_tu_i32m2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma -; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v10, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma +; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v10, v11 ; CHECK-NEXT: ret entry: %a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 16 x i8> %2, + <vscale x 8 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 4 x i32> %a } -define <vscale x 4 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +define <vscale x 4 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m2(<vscale x 4 x i32> %0, <vscale x 8 x i8> %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_ta_i32m2: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v10, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v10, v11 ; CHECK-NEXT: ret entry: %a = call <vscale x 4 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv4i32.nxv8i8.nxv16i8( <vscale x 4 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 16 x i8> %2, + <vscale x 8 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 4 x i32> %a @@ -83,36 +83,36 @@ entry: declare <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32>, <vscale x 8 x i8>, - <vscale x 32 x i8>, + <vscale x 16 x i8>, iXLen, iXLen); -define <vscale x 8 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +define <vscale x 8 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_tu_i32m4: ; CHECK: # %bb.0: # %entry -; 
CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma -; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v12, v16 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v12, v14 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 32 x i8> %2, + <vscale x 16 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 8 x i32> %a } -define <vscale x 8 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +define <vscale x 8 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m4(<vscale x 8 x i32> %0, <vscale x 8 x i8> %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_ta_i32m4: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v12, v16 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v12, v14 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv8i32.nxv8i8.nxv32i8( <vscale x 8 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 32 x i8> %2, + <vscale x 16 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 8 x i32> %a @@ -121,38 +121,36 @@ entry: declare <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32>, <vscale x 8 x i8>, - <vscale x 64 x i8>, + <vscale x 32 x i8>, iXLen, iXLen); -define <vscale x 16 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +define <vscale x 16 x i32> @intrinsic_vqmaccus_4x8x4_tu_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_tu_i32m8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vl8r.v v24, (a0) -; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma -; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma +; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v16, v20 ; CHECK-NEXT: ret entry: %a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 64 x i8> %2, + <vscale x 32 x i8> %2, iXLen %3, iXLen 2) ret <vscale x 16 x i32> %a } -define <vscale x 16 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +define <vscale x 16 x i32> @intrinsic_vqmaccus_4x8x4_ta_i32m8(<vscale x 16 x i32> %0, <vscale x 8 x i8> %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vqmaccus_4x8x4_ta_i32m8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vl8r.v v24, (a0) -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v16, v24 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: sf.vqmaccus.4x8x4 v8, v16, v20 ; CHECK-NEXT: ret entry: %a = call <vscale x 16 x i32> @llvm.riscv.sf.vqmaccus.4x8x4.nxv16i32.nxv8i8.nxv64i8( <vscale x 16 x i32> %0, <vscale x 8 x i8> %1, - <vscale x 64 x i8> %2, + <vscale x 32 x i8> %2, iXLen %3, iXLen 3) ret <vscale x 16 x i32> %a From 66922a566bc29d9d9cc056964cb5d1c868da1ea3 Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Tue, 26 Dec 2023 14:22:38 +0800 Subject: [PATCH 274/342] [X86][NFC] Simplify the definition of MULX by using class ITy --- llvm/lib/Target/X86/X86InstrArithmetic.td | 73 +++++++++++------------ 1 file changed, 34 insertions(+), 39 deletions(-) diff --git 
a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 71abd03044c82..5f77091bc8029 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1135,53 +1135,48 @@ let Predicates = [HasBMI], AddedComplexity = -6 in { //===----------------------------------------------------------------------===// // MULX Instruction // -multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop, - X86FoldableSchedWrite sched> { -let hasSideEffects = 0 in { -let Predicates = [HasBMI2, NoEGPR] in { - def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), - !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8, XD, VEX, VVVV, Sched<[WriteIMulH, sched]>; - +multiclass MulX<X86TypeInfo t, X86FoldableSchedWrite sched> { + defvar mulx_args = "{$src, $dst2, $dst1|$dst1, $dst2, $src}"; + defvar mulx_rm_sched = + [WriteIMulHLd, sched.Folded, + // Memory operand. + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // Implicit read of EDX/RDX + sched.ReadAfterFold]; + + def rr : ITy<0xF6, MRMSrcReg, t, (outs t.RegClass:$dst1, t.RegClass:$dst2), + (ins t.RegClass:$src), "mulx", mulx_args, []>, T8, XD, VEX, + VVVV, Sched<[WriteIMulH, sched]>; let mayLoad = 1 in - def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), - !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8, XD, VEX, VVVV, - Sched<[WriteIMulHLd, sched.Folded, - // Memory operand. - ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, - // Implicit read of EDX/RDX - sched.ReadAfterFold]>; - + def rm : ITy<0xF6, MRMSrcMem, t, (outs t.RegClass:$dst1, t.RegClass:$dst2), + (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD, VEX, + VVVV, Sched<mulx_rm_sched>; + + let Predicates = [In64BitMode] in { + def rr_EVEX : ITy<0xF6, MRMSrcReg, t, + (outs t.RegClass:$dst1, t.RegClass:$dst2), + (ins t.RegClass:$src), "mulx", mulx_args, []>, T8, XD, + EVEX, VVVV, Sched<[WriteIMulH, sched]>; + let mayLoad = 1 in + def rm_EVEX : ITy<0xF6, MRMSrcMem, t, + (outs t.RegClass:$dst1, t.RegClass:$dst2), + (ins t.MemOperand:$src), "mulx", mulx_args, []>, T8, XD, + EVEX, VVVV, Sched<mulx_rm_sched>; + } // Pseudo instructions to be used when the low result isn't used. The // instruction is defined to keep the high if both destinations are the same. - def Hrr : PseudoI<(outs RC:$dst), (ins RC:$src), - []>, Sched<[sched]>; - + def Hrr : PseudoI<(outs t.RegClass:$dst), (ins t.RegClass:$src), []>, + Sched<[sched]>; let mayLoad = 1 in - def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src), - []>, Sched<[sched.Folded]>; -} -let Predicates = [HasBMI2, HasEGPR, In64BitMode] in - def rr#_EVEX : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src), - !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8, XD, EVEX, VVVV, Sched<[WriteIMulH, sched]>; -let Predicates = [HasBMI2, HasEGPR, In64BitMode], mayLoad = 1 in - def rm#_EVEX : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src), - !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"), - []>, T8, XD, EVEX, VVVV, - Sched<[WriteIMulHLd, sched.Folded, - // Memory operand. 
- ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, - // Implicit read of EDX/RDX - sched.ReadAfterFold]>; -} + def Hrm : PseudoI<(outs t.RegClass:$dst), (ins t.MemOperand:$src), []>, + Sched<[sched.Folded]>; } let Uses = [EDX] in - defm MULX32 : bmi_mulx<"mulx{l}", GR32, i32mem, WriteMULX32>; +defm MULX32 : MulX<Xi32, WriteMULX32>; + let Uses = [RDX] in - defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem, WriteMULX64>, REX_W; +defm MULX64 : MulX<Xi64, WriteMULX64>, REX_W; //===----------------------------------------------------------------------===// // ADCX and ADOX Instructions From c019ed972f95cd17838b6d01257383539a5d889c Mon Sep 17 00:00:00 2001 From: Michael Lettrich <MichaelLettrich@users.noreply.github.com> Date: Tue, 26 Dec 2023 08:40:40 +0100 Subject: [PATCH 275/342] Allow to pass config file to clang-tidy-diff (#75457) Adds a `-config-file` command line option that passes on the path of .`clang-tidy` or custom config file to the `clang-tidy` executable. --- .../clang-tidy/tool/clang-tidy-diff.py | 8 ++++++++ clang-tools-extra/docs/ReleaseNotes.rst | 15 +++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py index 8817e2914f6e2..d96b3450fdbe8 100755 --- a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py +++ b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py @@ -173,6 +173,12 @@ def main(): help="checks filter, when not specified, use clang-tidy " "default", default="", ) + parser.add_argument( + "-config-file", + dest="config_file", + help="Specify the path of .clang-tidy or custom config file", + default="", + ) parser.add_argument("-use-color", action="store_true", help="Use colors in output") parser.add_argument( "-path", dest="build_path", help="Path used to read a compile command database." @@ -313,6 +319,8 @@ def main(): common_clang_tidy_args.append("-fix") if args.checks != "": common_clang_tidy_args.append("-checks=" + args.checks) + if args.config_file != "": + common_clang_tidy_args.append("-config-file=" + args.config_file) if args.quiet: common_clang_tidy_args.append("-quiet") if args.build_path is not None: diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index c843efac754ce..00f570bcd2184 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -119,15 +119,22 @@ Improvements to clang-tidy - Improved `--dump-config` to print check options in alphabetical order. -- Improved :program:`clang-tidy-diff.py` script. It now returns exit code `1` - if any :program:`clang-tidy` subprocess exits with a non-zero code or if - exporting fixes fails. It now accepts a directory as a value for - `-export-fixes` to export individual yaml files for each compilation unit. +- Improved :program:`clang-tidy-diff.py` script. + * Return exit code `1` if any :program:`clang-tidy` subprocess exits with + a non-zero code or if exporting fixes fails. + + * Accept a directory as a value for `-export-fixes` to export individual + yaml files for each compilation unit. + + * Introduce a `-config-file` option that forwards a configuration file to + :program:`clang-tidy`. Corresponds to the `--config-file` option in + :program:`clang-tidy`. - Improved :program:`run-clang-tidy.py` script. It now accepts a directory as a value for `-export-fixes` to export individual yaml files for each compilation unit. 
+ New checks ^^^^^^^^^^ From dc1fadef232948712ea5cb327b79213475941f11 Mon Sep 17 00:00:00 2001 From: Vettel <924105575@qq.com> Date: Tue, 26 Dec 2023 16:22:42 +0800 Subject: [PATCH 276/342] [MCP] Enhance MCP copy Instruction removal for special case(reapply) (#74239) Machine Copy Propagation Pass may lose some opportunities to further remove the redundant copy instructions during the ForwardCopyPropagateBlock procedure. When we Clobber a "Def" register, we also need to remove the record from the copy maps that indicates "Src" defined "Def" to ensure the correct semantics of the ClobberRegister function. This patch reapplies #70778 and addresses the corner case bug #73512 specific to the AMDGPU backend. Additionally, it refines the criteria for removing empty records from the copy maps, thereby enhancing overall safety. For more information, please see the C++ test case generated code in "vector.body" after the MCP Pass: https://gcc.godbolt.org/z/nK4oMaWv5. --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 42 ++++++++++++++++++- .../CodeGen/AMDGPU/mcp-implicit-clobber.mir | 26 ++++++++++++ .../RISCV/rvv/fixed-vectors-nearbyint-vp.ll | 1 - llvm/test/CodeGen/X86/shift-i128.ll | 4 -- llvm/test/CodeGen/X86/shift-i256.ll | 1 - .../X86/smulo-128-legalisation-lowering.ll | 2 +- .../vector-interleaved-load-i16-stride-7.ll | 21 ++++------ .../vector-interleaved-load-i64-stride-7.ll | 4 +- .../vector-interleaved-load-i8-stride-5.ll | 3 +- .../vector-interleaved-load-i8-stride-6.ll | 6 +-- .../vector-interleaved-load-i8-stride-7.ll | 21 +++++----- .../vector-interleaved-load-i8-stride-8.ll | 4 +- .../vector-interleaved-store-i16-stride-7.ll | 3 +- .../X86/wide-scalar-shift-legalization.ll | 18 ++------ 14 files changed, 98 insertions(+), 58 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index a032b31a1fc7c..51e944d0279f2 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -175,8 +175,46 @@ class CopyTracker { if (MachineInstr *MI = I->second.MI) { std::optional<DestSourcePair> CopyOperands = isCopyInstr(*MI, TII, UseCopyInstr); - markRegsUnavailable({CopyOperands->Destination->getReg().asMCReg()}, - TRI); + + MCRegister Def = CopyOperands->Destination->getReg().asMCReg(); + MCRegister Src = CopyOperands->Source->getReg().asMCReg(); + + markRegsUnavailable(Def, TRI); + + // Since we clobber the destination of a copy, the semantic of Src's + // "DefRegs" to contain Def is no longer effectual. We will also need + // to remove the record from the copy maps that indicates Src defined + // Def. Failing to do so might cause the target to miss some + // opportunities to further eliminate redundant copy instructions. + // Consider the following sequence during the + // ForwardCopyPropagateBlock procedure: + // L1: r0 = COPY r9 <- TrackMI + // L2: r0 = COPY r8 <- TrackMI (Remove r9 defined r0 from tracker) + // L3: use r0 <- Remove L2 from MaybeDeadCopies + // L4: early-clobber r9 <- Clobber r9 (L2 is still valid in tracker) + // L5: r0 = COPY r8 <- Remove NopCopy + for (MCRegUnit SrcUnit : TRI.regunits(Src)) { + auto SrcCopy = Copies.find(SrcUnit); + if (SrcCopy != Copies.end() && SrcCopy->second.LastSeenUseInCopy) { + // If SrcCopy defines multiple values, we only need + // to erase the record for Def in DefRegs. 
+ for (auto itr = SrcCopy->second.DefRegs.begin(); + itr != SrcCopy->second.DefRegs.end(); itr++) { + if (*itr == Def) { + SrcCopy->second.DefRegs.erase(itr); + // If DefReg becomes empty after removal, we can remove the + // SrcCopy from the tracker's copy maps. We only remove those + // entries solely record the Def is defined by Src. If an + // entry also contains the definition record of other Def' + // registers, it cannot be cleared. + if (SrcCopy->second.DefRegs.empty() && !SrcCopy->second.MI) { + Copies.erase(SrcCopy); + } + break; + } + } + } + } } // Now we can erase the copy. Copies.erase(I); diff --git a/llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir b/llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir new file mode 100644 index 0000000000000..6e613243e38c5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mcp-implicit-clobber.mir @@ -0,0 +1,26 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -march=amdgcn -mcpu=gfx900 %s -o - -run-pass machine-cp -verify-machineinstrs | FileCheck %s + +# The MachineCopyPropagation Pass should not treat the subsequent +# instruction "$sgpr2_sgpr3 = COPY $sgpr6_sgpr7" as a NopCopy. +# For detailed information, please refer to issue 73512. +--- +name: foo +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5, $sgpr6_sgpr7 + + ; CHECK-LABEL: name: foo + ; CHECK: liveins: $sgpr4_sgpr5, $sgpr6_sgpr7 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr2_sgpr3 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr0 + ; CHECK-NEXT: $sgpr2_sgpr3 = COPY $sgpr6_sgpr7 + ; CHECK-NEXT: S_NOP 0, implicit $sgpr2_sgpr3 + $sgpr2_sgpr3 = COPY $sgpr6_sgpr7 + $sgpr0 = COPY $sgpr3 + S_NOP 0, implicit-def $sgpr0 + $sgpr3 = COPY killed $sgpr5 + $sgpr2_sgpr3 = COPY $sgpr6_sgpr7 + S_NOP 0, implicit $sgpr2_sgpr3 +... 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll index d9958f4aae350..5407eadb160bd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll @@ -637,7 +637,6 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z ; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vfabs.v v16, v24, v0.t ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmv1r.v v0, v1 ; CHECK-NEXT: vmflt.vf v1, v16, fa5, v0.t ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 1fe8d834dbcdd..4fbe05cd1b2f2 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -347,7 +347,6 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl %edx, %ecx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; i686-NEXT: shrdl %cl, %eax, (%esp) # 4-byte Folded Spill -; i686-NEXT: movl %edx, %ecx ; i686-NEXT: shrl %cl, %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl %esi, 28(%ecx) @@ -489,7 +488,6 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; i686-NEXT: shrdl %cl, %esi, %ebx -; i686-NEXT: movl %edx, %ecx ; i686-NEXT: sarl %cl, %ebp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl %ebp, 28(%ecx) @@ -623,11 +621,9 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: shll %cl, %edi ; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ecx, %edi ; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: negl %ebp ; i686-NEXT: movl 64(%esp,%ebp), %esi -; i686-NEXT: movl %edi, %ecx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: movl (%esp), %edi # 4-byte Reload ; i686-NEXT: shldl %cl, %edi, %esi diff --git a/llvm/test/CodeGen/X86/shift-i256.ll b/llvm/test/CodeGen/X86/shift-i256.ll index 0e4e706669300..e1466aebf4225 100644 --- a/llvm/test/CodeGen/X86/shift-i256.ll +++ b/llvm/test/CodeGen/X86/shift-i256.ll @@ -78,7 +78,6 @@ define void @shift1(i256 %x, i256 %a, ptr nocapture %r) nounwind readnone { ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; CHECK-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; CHECK-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; CHECK-NEXT: movl 28(%esp,%ebp), %edx diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index abab313f4b12e..b2b5bcc5b44b2 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -1201,7 +1201,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ebp, %ebx +; X86-NEXT: addl %edx, %ebx ; X86-NEXT: adcl $0, %ebp ; 
X86-NEXT: movl %ecx, %eax ; X86-NEXT: movl %ecx, %esi diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index b1f0dcb9238a5..08667aed4bb35 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -14447,7 +14447,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm23, %ymm10 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7] @@ -14483,7 +14482,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm11 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm7 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3],xmm3[4],xmm7[5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm1, %xmm3, %xmm1 @@ -14516,7 +14515,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1] @@ -14530,8 +14529,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 {%k1} # 16-byte Folded Reload ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm10, %ymm18 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm15, %ymm18 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm11, %ymm25 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm6 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7] @@ -14738,7 +14737,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm27 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm13, %ymm19 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 ; 
AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7] @@ -14747,7 +14745,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128> ; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128] @@ -14823,14 +14820,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm12 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7] ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm13 -; AVX512DQ-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX512DQ-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[0,1,0,1] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7,8,9,10],ymm15[11],ymm12[12,13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm19, %ymm5 -; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7] +; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, %ymm4 +; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm10 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3,4,5],xmm10[6],xmm15[7] ; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index 91a70fb000dd6..db8bca5bc16bf 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -8490,12 +8490,12 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2q %zmm31, %zmm23, %zmm12 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4] ; AVX512F-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm31, %zmm24, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm31, %zmm8, %zmm0 ; AVX512F-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index a060d29200ba8..10ccd40e48655 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -2480,7 +2480,6 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7] ; SSE-NEXT: psllq $48, %xmm0 ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pandn %xmm5, %xmm1 @@ -2537,7 +2536,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pandn %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 992b190ac17cc..16808dca4511d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -1181,13 +1181,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm9, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: pand %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm13 ; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm10 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index f82a192c60b57..2b2cb554d6ac6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -1024,8 +1024,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm9, %xmm7 ; SSE-NEXT: pand %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm15 ; SSE-NEXT: pand %xmm14, %xmm15 ; SSE-NEXT: movdqa %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm8, %xmm3 @@ -2148,7 +2148,6 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm5, %xmm9 ; SSE-NEXT: pand %xmm13, %xmm9 ; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm10, %xmm13 @@ -2185,7 +2184,7 @@ 
define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: por %xmm10, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535] @@ -5451,19 +5450,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm14, %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm14, %xmm3 ; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: pandn %xmm11, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pand %xmm14, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pand %xmm14, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index 0c2df82fd1be5..f2133b9e42d30 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -11212,7 +11212,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm23 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] ; AVX512F-SLOW-NEXT: vpsrlq $32, %zmm17, %zmm9 @@ -11289,7 +11288,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm31 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 @@ -11302,7 +11300,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = 
xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX512F-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX512F-SLOW-NEXT: vmovdqa64 %xmm23, %xmm11 +; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm11 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm11, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 739a5c879de25..d253dd117b109 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -1343,10 +1343,9 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[3,3] ; SSE-NEXT: movdqa %xmm15, %xmm10 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: movdqa %xmm5, %xmm1 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,3,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0,2] ; SSE-NEXT: andps %xmm8, %xmm1 ; SSE-NEXT: orps %xmm6, %xmm1 diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 24475360cbbc4..f84131dfc8797 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -1845,7 +1845,6 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx @@ -2485,7 +2484,6 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, (%esp) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebx), %edx @@ -3129,7 +3127,6 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, (%esp) # 4-byte 
Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%esp,%ebp), %edx @@ -3562,7 +3559,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 @@ -4197,7 +4193,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload @@ -4879,7 +4874,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %rsi, %r14 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r15, %r11 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rsp,%r10), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: shldq %cl, %r12, %rsi @@ -5200,7 +5194,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notl %edx ; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -5211,7 +5205,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx @@ -5534,7 +5527,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload @@ -6233,7 +6225,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r15, %r11 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r12, %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -64(%rsp,%rdi), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; 
X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rbp ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 @@ -6872,7 +6863,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload @@ -7360,9 +7350,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ALL: {{.*}} -; X86: {{.*}} -; X86-NO-SHLD: {{.*}} -; X86-SHLD: {{.*}} ; X64: {{.*}} ; X64-NO-SHLD: {{.*}} ; X64-SHLD: {{.*}} +; X86: {{.*}} +; X86-NO-SHLD: {{.*}} +; X86-SHLD: {{.*}} From 250e98ee663bf347648ee8fea45c6083d6a8d716 Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Tue, 26 Dec 2023 16:28:55 +0800 Subject: [PATCH 277/342] [X86][NFC] Simplify the definition of ANDN by using class ITy --- llvm/lib/Target/X86/X86InstrArithmetic.td | 48 ++++++++++------------- llvm/lib/Target/X86/X86InstrUtils.td | 1 + 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 5f77091bc8029..2e59a2a1d673c 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -1089,36 +1089,30 @@ def : Pat<(X86testpat (loadi64 addr:$src1), i64relocImmSExt32_su:$src2), //===----------------------------------------------------------------------===// // ANDN Instruction // -multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop, - PatFrag ld_frag, X86FoldableSchedWrite sched> { -let Predicates = [HasBMI, NoEGPR] in { - def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>, - VEX, VVVV, Sched<[sched]>; - def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, EFLAGS, - (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>, - VEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; -} -let Predicates = [HasBMI, HasEGPR, In64BitMode] in { - def rr_EVEX : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), - !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))]>, - EVEX, VVVV, Sched<[sched]>; - def rm_EVEX : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, EFLAGS, - (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))]>, - EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; -} +multiclass AndN<X86TypeInfo t, string suffix> { + defvar andn_rr_p = + [(set t.RegClass:$dst, EFLAGS, (X86and_flag (not t.RegClass:$src1), + t.RegClass:$src2))]; + defvar andn_rm_p = + [(set t.RegClass:$dst, EFLAGS, 
(X86and_flag (not t.RegClass:$src1), + (t.LoadNode addr:$src2)))]; + def rr#suffix : ITy<0xF2, MRMSrcReg, t, (outs t.RegClass:$dst), + (ins t.RegClass:$src1, t.RegClass:$src2), "andn", + binop_ndd_args, andn_rr_p>, VVVV, Sched<[WriteALU]>, + T8, DefEFLAGS; + def rm#suffix : ITy<0xF2, MRMSrcMem, t, (outs t.RegClass:$dst), + (ins t.RegClass:$src1, t.MemOperand:$src2), "andn", + binop_ndd_args, andn_rm_p>, VVVV, + Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, + T8, DefEFLAGS; } // Complexity is reduced to give and with immediate a chance to match first. -let Defs = [EFLAGS], AddedComplexity = -6 in { - defm ANDN32 : bmi_andn<"andn{l}", GR32, i32mem, loadi32, WriteALU>, T8; - defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64, WriteALU>, T8, REX_W; +let AddedComplexity = -6 in { +defm ANDN32 : AndN<Xi32, "">, VEX, Requires<[HasBMI, NoEGPR]>; +defm ANDN64 : AndN<Xi64, "">, VEX, REX_W, Requires<[HasBMI, NoEGPR]>; +defm ANDN32 : AndN<Xi32, "_EVEX">, EVEX, Requires<[HasBMI, HasEGPR, In64BitMode]>; +defm ANDN64 : AndN<Xi64, "_EVEX">, EVEX, REX_W, Requires<[HasBMI, HasEGPR, In64BitMode]>; } let Predicates = [HasBMI], AddedComplexity = -6 in { diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 9499753143d9d..89f5653c04f2d 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -969,3 +969,4 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, } defvar binop_args = "{$src2, $src1|$src1, $src2}"; +defvar binop_ndd_args = "{$src2, $src1, $dst|$dst, $src1, $src2}"; From 3f3c5e558382f2861aff6578c4aad6866bfe7637 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman154@yahoo.com> Date: Tue, 26 Dec 2023 00:43:16 -0800 Subject: [PATCH 278/342] [NFC][llvm-exegesis] Refactor InstrBenchmark to BenchmarkResult (#76388) This patch refactors InstrBenchmark to BenchmarkResult. Most of the renaming away from things prefixed with Instr was performed in a previous commit, but this specific instance was missed. 
--- .../llvm-exegesis/lib/BenchmarkRunner.cpp | 48 ++++++++++--------- .../tools/llvm-exegesis/lib/BenchmarkRunner.h | 2 +- llvm/tools/llvm-exegesis/llvm-exegesis.cpp | 6 +-- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index 12e774e1a4b8d..1ee59a86ebbdc 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -499,19 +499,20 @@ BenchmarkRunner::getRunnableConfiguration( const SnippetRepetitor &Repetitor) const { RunnableConfiguration RC; - Benchmark &InstrBenchmark = RC.InstrBenchmark; - InstrBenchmark.Mode = Mode; - InstrBenchmark.CpuName = std::string(State.getTargetMachine().getTargetCPU()); - InstrBenchmark.LLVMTriple = + Benchmark &BenchmarkResult = RC.BenchmarkResult; + BenchmarkResult.Mode = Mode; + BenchmarkResult.CpuName = + std::string(State.getTargetMachine().getTargetCPU()); + BenchmarkResult.LLVMTriple = State.getTargetMachine().getTargetTriple().normalize(); - InstrBenchmark.NumRepetitions = NumRepetitions; - InstrBenchmark.Info = BC.Info; + BenchmarkResult.NumRepetitions = NumRepetitions; + BenchmarkResult.Info = BC.Info; const std::vector<MCInst> &Instructions = BC.Key.Instructions; bool GenerateMemoryInstructions = ExecutionMode == ExecutionModeE::SubProcess; - InstrBenchmark.Key = BC.Key; + BenchmarkResult.Key = BC.Key; // Assemble at least kMinInstructionsForSnippet instructions by repeating // the snippet for debug/analysis. This is so that the user clearly @@ -526,7 +527,7 @@ BenchmarkRunner::getRunnableConfiguration( return std::move(E); if (auto Err = getBenchmarkFunctionBytes(*Snippet, - InstrBenchmark.AssembledSnippet)) + BenchmarkResult.AssembledSnippet)) return std::move(Err); } @@ -534,8 +535,9 @@ BenchmarkRunner::getRunnableConfiguration( // measurements. 
if (BenchmarkPhaseSelector > BenchmarkPhaseSelectorE::PrepareAndAssembleSnippet) { - auto Snippet = assembleSnippet(BC, Repetitor, InstrBenchmark.NumRepetitions, - LoopBodySize, GenerateMemoryInstructions); + auto Snippet = + assembleSnippet(BC, Repetitor, BenchmarkResult.NumRepetitions, + LoopBodySize, GenerateMemoryInstructions); if (Error E = Snippet.takeError()) return std::move(E); RC.ObjectFile = getObjectFromBuffer(*Snippet); @@ -577,7 +579,7 @@ BenchmarkRunner::createFunctionExecutor( std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration( RunnableConfiguration &&RC, const std::optional<StringRef> &DumpFile) const { - Benchmark &InstrBenchmark = RC.InstrBenchmark; + Benchmark &BenchmarkResult = RC.BenchmarkResult; object::OwningBinary<object::ObjectFile> &ObjectFile = RC.ObjectFile; if (DumpFile && BenchmarkPhaseSelector > @@ -585,38 +587,38 @@ std::pair<Error, Benchmark> BenchmarkRunner::runConfiguration( auto ObjectFilePath = writeObjectFile(ObjectFile.getBinary()->getData(), *DumpFile); if (Error E = ObjectFilePath.takeError()) { - return {std::move(E), std::move(InstrBenchmark)}; + return {std::move(E), std::move(BenchmarkResult)}; } outs() << "Check generated assembly with: /usr/bin/objdump -d " << *ObjectFilePath << "\n"; } if (BenchmarkPhaseSelector < BenchmarkPhaseSelectorE::Measure) { - InstrBenchmark.Error = "actual measurements skipped."; - return {Error::success(), std::move(InstrBenchmark)}; + BenchmarkResult.Error = "actual measurements skipped."; + return {Error::success(), std::move(BenchmarkResult)}; } Expected<std::unique_ptr<BenchmarkRunner::FunctionExecutor>> Executor = - createFunctionExecutor(std::move(ObjectFile), RC.InstrBenchmark.Key); + createFunctionExecutor(std::move(ObjectFile), RC.BenchmarkResult.Key); if (!Executor) - return {Executor.takeError(), std::move(InstrBenchmark)}; + return {Executor.takeError(), std::move(BenchmarkResult)}; auto NewMeasurements = runMeasurements(**Executor); if (Error E = NewMeasurements.takeError()) { - return {std::move(E), std::move(InstrBenchmark)}; + return {std::move(E), std::move(BenchmarkResult)}; } - assert(InstrBenchmark.NumRepetitions > 0 && "invalid NumRepetitions"); + assert(BenchmarkResult.NumRepetitions > 0 && "invalid NumRepetitions"); for (BenchmarkMeasure &BM : *NewMeasurements) { // Scale the measurements by instruction. - BM.PerInstructionValue /= InstrBenchmark.NumRepetitions; + BM.PerInstructionValue /= BenchmarkResult.NumRepetitions; // Scale the measurements by snippet. 
BM.PerSnippetValue *= - static_cast<double>(InstrBenchmark.Key.Instructions.size()) / - InstrBenchmark.NumRepetitions; + static_cast<double>(BenchmarkResult.Key.Instructions.size()) / + BenchmarkResult.NumRepetitions; } - InstrBenchmark.Measurements = std::move(*NewMeasurements); + BenchmarkResult.Measurements = std::move(*NewMeasurements); - return {Error::success(), std::move(InstrBenchmark)}; + return {Error::success(), std::move(BenchmarkResult)}; } Expected<std::string> diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h index 2c48d07e37ca9..d746a0f775646 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h @@ -56,7 +56,7 @@ class BenchmarkRunner { private: RunnableConfiguration() = default; - Benchmark InstrBenchmark; + Benchmark BenchmarkResult; object::OwningBinary<object::ObjectFile> ObjectFile; }; diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index a5f8a09dcb241..1b35fde815f11 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -410,7 +410,7 @@ static void runBenchmarkConfigurations( std::optional<StringRef> DumpFile; if (DumpObjectToDisk.getNumOccurrences()) DumpFile = DumpObjectToDisk; - auto [Err, InstrBenchmark] = + auto [Err, BenchmarkResult] = Runner.runConfiguration(std::move(RC), DumpFile); if (Err) { // Errors from executing the snippets are fine. @@ -419,9 +419,9 @@ static void runBenchmarkConfigurations( llvm::errs() << "llvm-exegesis error: " << toString(std::move(Err)); exit(1); } - InstrBenchmark.Error = toString(std::move(Err)); + BenchmarkResult.Error = toString(std::move(Err)); } - AllResults.push_back(std::move(InstrBenchmark)); + AllResults.push_back(std::move(BenchmarkResult)); } Benchmark &Result = AllResults.front(); From b996f84bc421387a36effd2aa2fa1abad25d1762 Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Tue, 26 Dec 2023 17:13:05 +0800 Subject: [PATCH 279/342] [RISCV][NFC] Refine MCOperandPredicate code for rtlist. (#76028) (Imm <= 15) could be implied by isUInt<4>(Imm). --- llvm/lib/Target/RISCV/RISCVInstrInfoZc.td | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td index a78f362444687..9a7249fe3e3d6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td @@ -56,9 +56,8 @@ def rlist : Operand<OtherVT> { int64_t Imm; if (!MCOp.evaluateAsConstantImm(Imm)) return false; - if (!isUInt<4>(Imm)) return false; // 0~3 Reserved for EABI - return (Imm >= 4) && (Imm <= 15); + return isUInt<4>(Imm) && Imm >= 4; }]; } From 1d76692cf831aa27c4743999c8cd416e38810e67 Mon Sep 17 00:00:00 2001 From: Jivan Hakobyan <jivanhakobyan9@gmail.com> Date: Tue, 26 Dec 2023 13:21:38 +0400 Subject: [PATCH 280/342] [RISCV][MC] Add support for experimental Zimop extension (#75182) This implements experimental support for the Zimop extension as specified here: https://github.com/riscv/riscv-isa-manual/blob/main/src/zimop.adoc. This change adds only assembly support. 
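As a rough usage sketch (not part of this patch), the new mnemonics can be reached from C++ through inline assembly once a toolchain containing this change is invoked with -menable-experimental-extensions and an -march string that includes zimop (e.g. rv64i_zimop0p1, as in the clang preprocessor test below). The operand order (rd, rs1 and rd, rs1, rs2) follows the rvzimop-valid.s test added below; the wrapper names are hypothetical, and since Zimop instructions are may-be-operations, the returned value has no architecturally defined meaning here — the snippet only demonstrates the assembly syntax.

#include <cstdint>

// Hypothetical wrappers; they only show that the assembler accepts the
// mop.r.N / mop.rr.N spellings when targeting a zimop-enabled RISC-V.
static inline uint64_t mop_r_0(uint64_t Src) {
  uint64_t Dst;
  asm volatile("mop.r.0 %0, %1" : "=r"(Dst) : "r"(Src));
  return Dst;
}

static inline uint64_t mop_rr_7(uint64_t Src1, uint64_t Src2) {
  uint64_t Dst;
  asm volatile("mop.rr.7 %0, %1, %2" : "=r"(Dst) : "r"(Src1), "r"(Src2));
  return Dst;
}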
--------- Co-authored-by: ln8-8 <lyut.nersisyan@gmail.com> Co-authored-by: ln8-8 <73429801+ln8-8@users.noreply.github.com> --- .../test/Preprocessor/riscv-target-features.c | 9 ++++++ llvm/docs/RISCVUsage.rst | 3 ++ llvm/lib/Support/RISCVISAInfo.cpp | 2 ++ llvm/lib/Target/RISCV/RISCVFeatures.td | 6 ++++ llvm/lib/Target/RISCV/RISCVInstrFormats.td | 21 ++++++++++++++ llvm/lib/Target/RISCV/RISCVInstrInfo.td | 28 +++++++++++++++++++ llvm/test/CodeGen/RISCV/attributes.ll | 4 +++ llvm/test/MC/RISCV/rv32zimop-invalid.s | 6 ++++ llvm/test/MC/RISCV/rvzimop-valid.s | 26 +++++++++++++++++ llvm/unittests/Support/RISCVISAInfoTest.cpp | 1 + 10 files changed, 106 insertions(+) create mode 100644 llvm/test/MC/RISCV/rv32zimop-invalid.s create mode 100644 llvm/test/MC/RISCV/rvzimop-valid.s diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 35208b2eae8fb..2111b3f1c5832 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -120,6 +120,7 @@ // CHECK-NOT: __riscv_zfbfmin {{.*$}} // CHECK-NOT: __riscv_zicfilp {{.*$}} // CHECK-NOT: __riscv_zicond {{.*$}} +// CHECK-NOT: __riscv_zimop {{.*$}} // CHECK-NOT: __riscv_ztso {{.*$}} // CHECK-NOT: __riscv_zvbb {{.*$}} // CHECK-NOT: __riscv_zvbc {{.*$}} @@ -1071,6 +1072,14 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s // CHECK-ZICOND-EXT: __riscv_zicond 1000000{{$}} +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_zimop0p1 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZIMOP-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_zimop0p1 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZIMOP-EXT %s +// CHECK-ZIMOP-EXT: __riscv_zimop 1000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu -menable-experimental-extensions \ // RUN: -march=rv32iztso0p1 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZTSO-EXT %s diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 84cc83ef847a5..3125f2d7c9cfd 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -221,6 +221,9 @@ The primary goal of experimental support is to assist in the process of ratifica ``experimental-ztso`` LLVM implements the `v0.1 proposed specification <https://github.com/riscv/riscv-isa-manual/releases/download/draft-20220723-10eea63/riscv-spec.pdf>`__ (see Chapter 25). The mapping from the C/C++ memory model to Ztso has not yet been ratified in any standards document. There are multiple possible mappings, and they are *not* mutually ABI compatible. The mapping LLVM implements is ABI compatible with the default WMO mapping. This mapping may change and there is *explicitly* no ABI stability offered while the extension remains in experimental status. User beware. +``experimental-zimop`` + LLVM implements the `v0.1 proposed specification <https://github.com/riscv/riscv-isa-manual/blob/main/src/zimop.adoc>`__. + To use an experimental extension from `clang`, you must add `-menable-experimental-extensions` to the command line, and specify the exact version of the experimental extension you are using. To use an experimental extension with LLVM's internal developer tools (e.g. `llc`, `llvm-objdump`, `llvm-mc`), you must prefix the extension name with `experimental-`. Note that you don't need to specify the version with internal tools, and shouldn't include the `experimental-` prefix with `clang`. 
Vendor Extensions diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index 7256e9a293299..e71e96e3417e4 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -196,6 +196,8 @@ static const RISCVSupportedExtension SupportedExperimentalExtensions[] = { {"zicfilp", RISCVExtensionVersion{0, 4}}, {"zicond", RISCVExtensionVersion{1, 0}}, + {"zimop", RISCVExtensionVersion{0, 1}}, + {"ztso", RISCVExtensionVersion{0, 1}}, {"zvfbfmin", RISCVExtensionVersion{0, 8}}, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index a66dd135ae5f8..a6e7c15b50e97 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -687,6 +687,12 @@ def HasStdExtZicond : Predicate<"Subtarget->hasStdExtZicond()">, AssemblerPredicate<(all_of FeatureStdExtZicond), "'Zicond' (Integer Conditional Operations)">; +def FeatureStdExtZimop : SubtargetFeature<"experimental-zimop", "HasStdExtZimop", "true", + "'Zimop' (May-Be-Operations)">; +def HasStdExtZimop : Predicate<"Subtarget->hasStdExtZimop()">, + AssemblerPredicate<(all_of FeatureStdExtZimop), + "'Zimop' (May-Be-Operations)">; + def FeatureStdExtSmaia : SubtargetFeature<"smaia", "HasStdExtSmaia", "true", "'Smaia' (Smaia encompasses all added CSRs and all " diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index f56f49ae24571..288c33cfe11c8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -410,6 +410,27 @@ class RVInstIUnary<bits<12> imm12, bits<3> funct3, RISCVOpcode opcode, let Inst{31-20} = imm12; } +class RVInstIMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3, RISCVOpcode opcode, + dag outs, dag ins, string opcodestr, string argstr> + : RVInstIBase<funct3, opcode, outs, ins, opcodestr, argstr> { + let Inst{31} = imm7{6}; + let Inst{30} = imm5{4}; + let Inst{29-28} = imm7{5-4}; + let Inst{27-26} = imm5{3-2}; + let Inst{25-22} = imm7{3-0}; + let Inst{21-20} = imm5{1-0}; +} + +class RVInstRMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3, RISCVOpcode opcode, + dag outs, dag ins, string opcodestr, string argstr> + : RVInstRBase<funct3, opcode, outs, ins, opcodestr, argstr> { + let Inst{31} = imm4{3}; + let Inst{30} = imm3{2}; + let Inst{29-28} = imm4{2-1}; + let Inst{27-26} = imm3{1-0}; + let Inst{25} = imm4{0}; +} + class RVInstS<bits<3> funct3, RISCVOpcode opcode, dag outs, dag ins, string opcodestr, string argstr> : RVInst<outs, ins, opcodestr, argstr, [], InstFormatS> { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index edc08187d8f77..099cc0abd1424 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -597,6 +597,18 @@ class Priv_rr<string opcodestr, bits<7> funct7> let rd = 0; } +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3, + RISCVOpcode opcode, string opcodestr> + : RVInstIMopr<imm7, imm5, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1), + opcodestr, "$rd, $rs1">; + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +class RVMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3, + RISCVOpcode opcode, string opcodestr> + : RVInstRMoprr<imm4, imm3, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2), + opcodestr, "$rd, $rs1, $rs2">; + //===----------------------------------------------------------------------===// // Instructions 
//===----------------------------------------------------------------------===// @@ -786,6 +798,22 @@ def SRAW : ALUW_rr<0b0100000, 0b101, "sraw">, } // IsSignExtendingOpW = 1 } // Predicates = [IsRV64] +// Zimop instructions + +foreach i = 0...31 in { + let Predicates = [HasStdExtZimop] in { + def MOPR#i : RVMopr<0b1000111, i, 0b100, OPC_SYSTEM, "mop.r."#i>, + Sched<[]>; + } // Predicates = [HasStdExtZimop] +} + +foreach i = 0...7 in { + let Predicates = [HasStdExtZimop] in { + def MOPRR#i : RVMoprr<0b1001, i, 0b100, OPC_SYSTEM, "mop.rr."#i>, + Sched<[]>; + } // Predicates = [HasStdExtZimop] +} + //===----------------------------------------------------------------------===// // Privileged instructions //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 7e14c0f2c43be..5841f1a98f23e 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -85,6 +85,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+zve32x -mattr=+zvkt %s -o - | FileCheck --check-prefix=RV32ZVKT %s ; RUN: llc -mtriple=riscv32 -mattr=+zvfh %s -o - | FileCheck --check-prefix=RV32ZVFH %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicond %s -o - | FileCheck --check-prefix=RV32ZICOND %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-zimop %s -o - | FileCheck --check-prefix=RV32ZIMOP %s ; RUN: llc -mtriple=riscv32 -mattr=+smaia %s -o - | FileCheck --check-prefixes=CHECK,RV32SMAIA %s ; RUN: llc -mtriple=riscv32 -mattr=+ssaia %s -o - | FileCheck --check-prefixes=CHECK,RV32SSAIA %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s @@ -177,6 +178,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zve32x -mattr=+zvkt %s -o - | FileCheck --check-prefix=RV64ZVKT %s ; RUN: llc -mtriple=riscv64 -mattr=+zvfh %s -o - | FileCheck --check-prefix=RV64ZVFH %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicond %s -o - | FileCheck --check-prefix=RV64ZICOND %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-zimop %s -o - | FileCheck --check-prefix=RV64ZIMOP %s ; RUN: llc -mtriple=riscv64 -mattr=+smaia %s -o - | FileCheck --check-prefixes=CHECK,RV64SMAIA %s ; RUN: llc -mtriple=riscv64 -mattr=+ssaia %s -o - | FileCheck --check-prefixes=CHECK,RV64SSAIA %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s @@ -271,6 +273,7 @@ ; RV32ZVKT: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvkt1p0_zvl32b1p0" ; RV32ZVFH: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfhmin1p0_zve32f1p0_zve32x1p0_zvfh1p0_zvfhmin1p0_zvl32b1p0" ; RV32ZICOND: .attribute 5, "rv32i2p1_zicond1p0" +; RV32ZIMOP: .attribute 5, "rv32i2p1_zimop0p1" ; RV32SMAIA: .attribute 5, "rv32i2p1_smaia1p0" ; RV32SSAIA: .attribute 5, "rv32i2p1_ssaia1p0" ; RV32ZFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin0p8" @@ -362,6 +365,7 @@ ; RV64ZVKT: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvkt1p0_zvl32b1p0" ; RV64ZVFH: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfhmin1p0_zve32f1p0_zve32x1p0_zvfh1p0_zvfhmin1p0_zvl32b1p0" ; RV64ZICOND: .attribute 5, "rv64i2p1_zicond1p0" +; RV64ZIMOP: .attribute 5, "rv64i2p1_zimop0p1" ; RV64SMAIA: .attribute 5, "rv64i2p1_smaia1p0" ; RV64SSAIA: .attribute 5, "rv64i2p1_ssaia1p0" ; RV64ZFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin0p8" diff --git a/llvm/test/MC/RISCV/rv32zimop-invalid.s b/llvm/test/MC/RISCV/rv32zimop-invalid.s new file mode 100644 index 0000000000000..e6c3adc4cd309 --- 
/dev/null +++ b/llvm/test/MC/RISCV/rv32zimop-invalid.s @@ -0,0 +1,6 @@ +# RUN: not llvm-mc -triple riscv32 -mattr=+experimental-zimop < %s 2>&1 | FileCheck %s + +# Too few operands +mop.r.0 t0 # CHECK: :[[@LINE]]:1: error: too few operands for instruction +# Too few operands +mop.rr.0 t0, t1 # CHECK: :[[@LINE]]:1: error: too few operands for instruction diff --git a/llvm/test/MC/RISCV/rvzimop-valid.s b/llvm/test/MC/RISCV/rvzimop-valid.s new file mode 100644 index 0000000000000..1552936629902 --- /dev/null +++ b/llvm/test/MC/RISCV/rvzimop-valid.s @@ -0,0 +1,26 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-zimop -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-zimop -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+experimental-zimop < %s \ +# RUN: | llvm-objdump --mattr=+experimental-zimop -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+experimental-zimop < %s \ +# RUN: | llvm-objdump --mattr=+experimental-zimop -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s + +# CHECK-ASM-AND-OBJ: mop.r.0 a2, a1 +# CHECK-ASM: encoding: [0x73,0xc6,0xc5,0x81] +mop.r.0 a2, a1 + +# CHECK-ASM-AND-OBJ: mop.r.31 a2, a1 +# CHECK-ASM: encoding: [0x73,0xc6,0xf5,0xcd] +mop.r.31 a2, a1 + +# CHECK-ASM-AND-OBJ: mop.rr.0 a3, a2, a1 +# CHECK-ASM: encoding: [0xf3,0x46,0xb6,0x82] +mop.rr.0 a3, a2, a1 + +# CHECK-ASM-AND-OBJ: mop.rr.7 a3, a2, a1 +# CHECK-ASM: encoding: [0xf3,0x46,0xb6,0xce] +mop.rr.7 a3, a2, a1 \ No newline at end of file diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp index 2dd307603a82f..eeac1e8175658 100644 --- a/llvm/unittests/Support/RISCVISAInfoTest.cpp +++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp @@ -756,6 +756,7 @@ R"(All available -march extensions for RISC-V Experimental extensions zicfilp 0.4 This is a long dummy description zicond 1.0 + zimop 0.1 zacas 1.0 zfbfmin 0.8 ztso 0.1 From 9dcc66578e12ad8e72c8ae7216122a1125976ac5 Mon Sep 17 00:00:00 2001 From: Da-Viper <57949090+Da-Viper@users.noreply.github.com> Date: Tue, 26 Dec 2023 09:27:10 +0000 Subject: [PATCH 281/342] [clang-tidy] Don't replace typedefs in extern c scope (#69102) Added IgnoreExternC option to modernize-use-using check. 
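For illustration, the input below mirrors the tests added in this patch: with the check configured as -config="{CheckOptions: {modernize-use-using.IgnoreExternC: true}}", the typedef inside the extern "C" block is left untouched, while typedefs in an extern "C++" block are still rewritten to using-declarations. The file-scope typedef at the end is not in the patch's tests but reflects the check's default behaviour of flagging ordinary typedefs.

extern "C" {
typedef int NewInt;        // kept as-is when IgnoreExternC is true
}

extern "C++" {
typedef int InExternCPP;   // still flagged: becomes `using InExternCPP = int;`
}

typedef int PlainTypedef;  // still flagged: becomes `using PlainTypedef = int;`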
Fixes #35272 --- .../clang-tidy/modernize/UseUsingCheck.cpp | 31 +++++++++++++++---- .../clang-tidy/modernize/UseUsingCheck.h | 1 + clang-tools-extra/docs/ReleaseNotes.rst | 3 +- .../clang-tidy/checks/modernize/use-using.rst | 13 ++++++++ .../modernize/use-using-ignore-extern-c.cpp | 14 +++++++++ .../checkers/modernize/use-using.cpp | 17 ++++++++++ 6 files changed, 72 insertions(+), 7 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp index e6293ed48bfdd..f5fc3ad3fac68 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp @@ -11,26 +11,39 @@ #include "clang/Lex/Lexer.h" using namespace clang::ast_matchers; +namespace { + +AST_MATCHER(clang::LinkageSpecDecl, isExternCLinkage) { + return Node.getLanguage() == clang::LinkageSpecDecl::lang_c; +} +} // namespace namespace clang::tidy::modernize { +static constexpr llvm::StringLiteral ExternCDeclName = "extern-c-decl"; static constexpr llvm::StringLiteral ParentDeclName = "parent-decl"; static constexpr llvm::StringLiteral TagDeclName = "tag-decl"; static constexpr llvm::StringLiteral TypedefName = "typedef"; UseUsingCheck::UseUsingCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)) {} + IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)), + IgnoreExternC(Options.get("IgnoreExternC", false)) {} void UseUsingCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "IgnoreMacros", IgnoreMacros); + Options.store(Opts, "IgnoreExternC", IgnoreExternC); } void UseUsingCheck::registerMatchers(MatchFinder *Finder) { - Finder->addMatcher(typedefDecl(unless(isInstantiated()), - hasParent(decl().bind(ParentDeclName))) - .bind(TypedefName), - this); + Finder->addMatcher( + typedefDecl( + unless(isInstantiated()), + optionally(hasAncestor( + linkageSpecDecl(isExternCLinkage()).bind(ExternCDeclName))), + hasParent(decl().bind(ParentDeclName))) + .bind(TypedefName), + this); // This matcher is used to find tag declarations in source code within // typedefs. They appear in the AST just *prior* to the typedefs. @@ -70,6 +83,11 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { if (MatchedDecl->getLocation().isInvalid()) return; + const auto *ExternCDecl = + Result.Nodes.getNodeAs<LinkageSpecDecl>(ExternCDeclName); + if (ExternCDecl && IgnoreExternC) + return; + SourceLocation StartLoc = MatchedDecl->getBeginLoc(); if (StartLoc.isMacroID() && IgnoreMacros) @@ -122,7 +140,8 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { Type = FirstTypedefName + Type.substr(FirstTypedefType.size() + 1); } if (!ReplaceRange.getEnd().isMacroID()) { - const SourceLocation::IntTy Offset = MatchedDecl->getFunctionType() ? 0 : Name.size(); + const SourceLocation::IntTy Offset = + MatchedDecl->getFunctionType() ? 
0 : Name.size(); LastReplacementEnd = ReplaceRange.getEnd().getLocWithOffset(Offset); } diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h index 5c741a92d0131..7054778d84a0c 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h @@ -20,6 +20,7 @@ namespace clang::tidy::modernize { class UseUsingCheck : public ClangTidyCheck { const bool IgnoreMacros; + const bool IgnoreExternC; SourceLocation LastReplacementEnd; llvm::DenseMap<const Decl *, SourceRange> LastTagDeclRanges; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 00f570bcd2184..ce82063dbfe23 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -408,7 +408,8 @@ Changes in existing checks - Improved :doc:`modernize-use-using <clang-tidy/checks/modernize/use-using>` check to fix function pointer and - forward declared ``typedef`` correctly. + forward declared ``typedef`` correctly. Added option `IgnoreExternC` to ignore ``typedef`` + declaration in ``extern "C"`` scope. - Improved :doc:`performance-faster-string-find <clang-tidy/checks/performance/faster-string-find>` check to properly escape diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst index eeddaf8d8d65a..32272a07994c2 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst @@ -28,6 +28,14 @@ After: using R_t = struct { int a; }; using R_p = R_t*; +The checker ignores `typedef` within `extern "C" { ... }` blocks. + +.. code-block:: c++ + + extern "C" { + typedef int InExternC; // Left intact. + } + This check requires using C++11 or higher to run. Options @@ -37,3 +45,8 @@ Options If set to `true`, the check will not give warnings inside macros. Default is `true`. + +.. option:: IgnoreExternC + + If set to `true`, the check will not give warning inside `extern "C"`scope. 
+ Default is `false` \ No newline at end of file diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp new file mode 100644 index 0000000000000..6a845a0bcc350 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp @@ -0,0 +1,14 @@ +// RUN: %check_clang_tidy %s modernize-use-using %t -- -config="{CheckOptions: {modernize-use-using.IgnoreExternC: true}}" -- -I %S/Input/use-using/ + +// Some Header +extern "C" { + +typedef int NewInt; +} + +extern "C++" { + +typedef int InExternCPP; +// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] +// CHECK-FIXES: using InExternCPP = int; +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp index 422abee11a719..462bc984fd3ad 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp @@ -325,3 +325,20 @@ typedef bool (*ISSUE_65055_2)(int); typedef class ISSUE_67529_1 *ISSUE_67529; // CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' // CHECK-FIXES: using ISSUE_67529 = class ISSUE_67529_1 *; + +// Some Header +extern "C" { + +typedef int InExternC; +// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] +// CHECK-FIXES: using InExternC = int; + +} + +extern "C++" { + +typedef int InExternCPP; +// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] +// CHECK-FIXES: using InExternCPP = int; + +} From 34621aa81f63812b31d1356030e9d74ce59e56fc Mon Sep 17 00:00:00 2001 From: Piotr Zegar <me@piotrzegar.pl> Date: Tue, 26 Dec 2023 09:35:18 +0000 Subject: [PATCH 282/342] Revert "[clang-tidy] Don't replace typedefs in extern c scope (#69102)" This reverts commit 9dcc66578e12ad8e72c8ae7216122a1125976ac5. 
--- .../clang-tidy/modernize/UseUsingCheck.cpp | 31 ++++--------------- .../clang-tidy/modernize/UseUsingCheck.h | 1 - clang-tools-extra/docs/ReleaseNotes.rst | 3 +- .../clang-tidy/checks/modernize/use-using.rst | 13 -------- .../modernize/use-using-ignore-extern-c.cpp | 14 --------- .../checkers/modernize/use-using.cpp | 17 ---------- 6 files changed, 7 insertions(+), 72 deletions(-) delete mode 100644 clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp index f5fc3ad3fac68..e6293ed48bfdd 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp @@ -11,39 +11,26 @@ #include "clang/Lex/Lexer.h" using namespace clang::ast_matchers; -namespace { - -AST_MATCHER(clang::LinkageSpecDecl, isExternCLinkage) { - return Node.getLanguage() == clang::LinkageSpecDecl::lang_c; -} -} // namespace namespace clang::tidy::modernize { -static constexpr llvm::StringLiteral ExternCDeclName = "extern-c-decl"; static constexpr llvm::StringLiteral ParentDeclName = "parent-decl"; static constexpr llvm::StringLiteral TagDeclName = "tag-decl"; static constexpr llvm::StringLiteral TypedefName = "typedef"; UseUsingCheck::UseUsingCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)), - IgnoreExternC(Options.get("IgnoreExternC", false)) {} + IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)) {} void UseUsingCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "IgnoreMacros", IgnoreMacros); - Options.store(Opts, "IgnoreExternC", IgnoreExternC); } void UseUsingCheck::registerMatchers(MatchFinder *Finder) { - Finder->addMatcher( - typedefDecl( - unless(isInstantiated()), - optionally(hasAncestor( - linkageSpecDecl(isExternCLinkage()).bind(ExternCDeclName))), - hasParent(decl().bind(ParentDeclName))) - .bind(TypedefName), - this); + Finder->addMatcher(typedefDecl(unless(isInstantiated()), + hasParent(decl().bind(ParentDeclName))) + .bind(TypedefName), + this); // This matcher is used to find tag declarations in source code within // typedefs. They appear in the AST just *prior* to the typedefs. @@ -83,11 +70,6 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { if (MatchedDecl->getLocation().isInvalid()) return; - const auto *ExternCDecl = - Result.Nodes.getNodeAs<LinkageSpecDecl>(ExternCDeclName); - if (ExternCDecl && IgnoreExternC) - return; - SourceLocation StartLoc = MatchedDecl->getBeginLoc(); if (StartLoc.isMacroID() && IgnoreMacros) @@ -140,8 +122,7 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { Type = FirstTypedefName + Type.substr(FirstTypedefType.size() + 1); } if (!ReplaceRange.getEnd().isMacroID()) { - const SourceLocation::IntTy Offset = - MatchedDecl->getFunctionType() ? 0 : Name.size(); + const SourceLocation::IntTy Offset = MatchedDecl->getFunctionType() ? 
0 : Name.size(); LastReplacementEnd = ReplaceRange.getEnd().getLocWithOffset(Offset); } diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h index 7054778d84a0c..5c741a92d0131 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h @@ -20,7 +20,6 @@ namespace clang::tidy::modernize { class UseUsingCheck : public ClangTidyCheck { const bool IgnoreMacros; - const bool IgnoreExternC; SourceLocation LastReplacementEnd; llvm::DenseMap<const Decl *, SourceRange> LastTagDeclRanges; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index ce82063dbfe23..00f570bcd2184 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -408,8 +408,7 @@ Changes in existing checks - Improved :doc:`modernize-use-using <clang-tidy/checks/modernize/use-using>` check to fix function pointer and - forward declared ``typedef`` correctly. Added option `IgnoreExternC` to ignore ``typedef`` - declaration in ``extern "C"`` scope. + forward declared ``typedef`` correctly. - Improved :doc:`performance-faster-string-find <clang-tidy/checks/performance/faster-string-find>` check to properly escape diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst index 32272a07994c2..eeddaf8d8d65a 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst @@ -28,14 +28,6 @@ After: using R_t = struct { int a; }; using R_p = R_t*; -The checker ignores `typedef` within `extern "C" { ... }` blocks. - -.. code-block:: c++ - - extern "C" { - typedef int InExternC; // Left intact. - } - This check requires using C++11 or higher to run. Options @@ -45,8 +37,3 @@ Options If set to `true`, the check will not give warnings inside macros. Default is `true`. - -.. option:: IgnoreExternC - - If set to `true`, the check will not give warning inside `extern "C"`scope. 
- Default is `false` \ No newline at end of file diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp deleted file mode 100644 index 6a845a0bcc350..0000000000000 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp +++ /dev/null @@ -1,14 +0,0 @@ -// RUN: %check_clang_tidy %s modernize-use-using %t -- -config="{CheckOptions: {modernize-use-using.IgnoreExternC: true}}" -- -I %S/Input/use-using/ - -// Some Header -extern "C" { - -typedef int NewInt; -} - -extern "C++" { - -typedef int InExternCPP; -// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] -// CHECK-FIXES: using InExternCPP = int; -} diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp index 462bc984fd3ad..422abee11a719 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp @@ -325,20 +325,3 @@ typedef bool (*ISSUE_65055_2)(int); typedef class ISSUE_67529_1 *ISSUE_67529; // CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' // CHECK-FIXES: using ISSUE_67529 = class ISSUE_67529_1 *; - -// Some Header -extern "C" { - -typedef int InExternC; -// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] -// CHECK-FIXES: using InExternC = int; - -} - -extern "C++" { - -typedef int InExternCPP; -// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] -// CHECK-FIXES: using InExternCPP = int; - -} From 583a2583bb5f53b7b2cbd3d2043c0b2ac286464f Mon Sep 17 00:00:00 2001 From: Da-Viper <57949090+Da-Viper@users.noreply.github.com> Date: Tue, 26 Dec 2023 09:27:10 +0000 Subject: [PATCH 283/342] [clang-tidy] Don't replace typedefs in extern c scope (#69102) Added IgnoreExternC option to modernize-use-using check. 
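
A minimal sketch of the resulting behaviour (the option name comes from this patch and the `extern "C"` / `extern "C++"` declarations mirror the test added below; the plain file-scope `typedef` is only an extra illustration): with `IgnoreExternC: true` the check skips `typedef`s that live in an `extern "C"` block but keeps rewriting everything else.

```c++
// Run as, e.g.:
//   clang-tidy -checks=-*,modernize-use-using \
//     -config="{CheckOptions: {modernize-use-using.IgnoreExternC: true}}" file.cpp
extern "C" {
typedef int NewInt;       // skipped: typedef sits inside an extern "C" block
}

extern "C++" {
typedef int InExternCPP;  // still flagged; fix-it: using InExternCPP = int;
}

typedef int PlainTypedef; // still flagged; fix-it: using PlainTypedef = int;
```
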
Fixes #35272 --- .../clang-tidy/modernize/UseUsingCheck.cpp | 31 +++++++++++++++---- .../clang-tidy/modernize/UseUsingCheck.h | 1 + clang-tools-extra/docs/ReleaseNotes.rst | 3 +- .../clang-tidy/checks/modernize/use-using.rst | 13 ++++++++ .../modernize/use-using-ignore-extern-c.cpp | 14 +++++++++ .../checkers/modernize/use-using.cpp | 17 ++++++++++ 6 files changed, 72 insertions(+), 7 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp index e6293ed48bfdd..f5fc3ad3fac68 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp @@ -11,26 +11,39 @@ #include "clang/Lex/Lexer.h" using namespace clang::ast_matchers; +namespace { + +AST_MATCHER(clang::LinkageSpecDecl, isExternCLinkage) { + return Node.getLanguage() == clang::LinkageSpecDecl::lang_c; +} +} // namespace namespace clang::tidy::modernize { +static constexpr llvm::StringLiteral ExternCDeclName = "extern-c-decl"; static constexpr llvm::StringLiteral ParentDeclName = "parent-decl"; static constexpr llvm::StringLiteral TagDeclName = "tag-decl"; static constexpr llvm::StringLiteral TypedefName = "typedef"; UseUsingCheck::UseUsingCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), - IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)) {} + IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)), + IgnoreExternC(Options.get("IgnoreExternC", false)) {} void UseUsingCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "IgnoreMacros", IgnoreMacros); + Options.store(Opts, "IgnoreExternC", IgnoreExternC); } void UseUsingCheck::registerMatchers(MatchFinder *Finder) { - Finder->addMatcher(typedefDecl(unless(isInstantiated()), - hasParent(decl().bind(ParentDeclName))) - .bind(TypedefName), - this); + Finder->addMatcher( + typedefDecl( + unless(isInstantiated()), + optionally(hasAncestor( + linkageSpecDecl(isExternCLinkage()).bind(ExternCDeclName))), + hasParent(decl().bind(ParentDeclName))) + .bind(TypedefName), + this); // This matcher is used to find tag declarations in source code within // typedefs. They appear in the AST just *prior* to the typedefs. @@ -70,6 +83,11 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { if (MatchedDecl->getLocation().isInvalid()) return; + const auto *ExternCDecl = + Result.Nodes.getNodeAs<LinkageSpecDecl>(ExternCDeclName); + if (ExternCDecl && IgnoreExternC) + return; + SourceLocation StartLoc = MatchedDecl->getBeginLoc(); if (StartLoc.isMacroID() && IgnoreMacros) @@ -122,7 +140,8 @@ void UseUsingCheck::check(const MatchFinder::MatchResult &Result) { Type = FirstTypedefName + Type.substr(FirstTypedefType.size() + 1); } if (!ReplaceRange.getEnd().isMacroID()) { - const SourceLocation::IntTy Offset = MatchedDecl->getFunctionType() ? 0 : Name.size(); + const SourceLocation::IntTy Offset = + MatchedDecl->getFunctionType() ? 
0 : Name.size(); LastReplacementEnd = ReplaceRange.getEnd().getLocWithOffset(Offset); } diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h index 5c741a92d0131..7054778d84a0c 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h @@ -20,6 +20,7 @@ namespace clang::tidy::modernize { class UseUsingCheck : public ClangTidyCheck { const bool IgnoreMacros; + const bool IgnoreExternC; SourceLocation LastReplacementEnd; llvm::DenseMap<const Decl *, SourceRange> LastTagDeclRanges; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 00f570bcd2184..ce82063dbfe23 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -408,7 +408,8 @@ Changes in existing checks - Improved :doc:`modernize-use-using <clang-tidy/checks/modernize/use-using>` check to fix function pointer and - forward declared ``typedef`` correctly. + forward declared ``typedef`` correctly. Added option `IgnoreExternC` to ignore ``typedef`` + declaration in ``extern "C"`` scope. - Improved :doc:`performance-faster-string-find <clang-tidy/checks/performance/faster-string-find>` check to properly escape diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst index eeddaf8d8d65a..32272a07994c2 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/modernize/use-using.rst @@ -28,6 +28,14 @@ After: using R_t = struct { int a; }; using R_p = R_t*; +The checker ignores `typedef` within `extern "C" { ... }` blocks. + +.. code-block:: c++ + + extern "C" { + typedef int InExternC; // Left intact. + } + This check requires using C++11 or higher to run. Options @@ -37,3 +45,8 @@ Options If set to `true`, the check will not give warnings inside macros. Default is `true`. + +.. option:: IgnoreExternC + + If set to `true`, the check will not give warning inside `extern "C"`scope. 
+ Default is `false` \ No newline at end of file diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp new file mode 100644 index 0000000000000..6a845a0bcc350 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using-ignore-extern-c.cpp @@ -0,0 +1,14 @@ +// RUN: %check_clang_tidy %s modernize-use-using %t -- -config="{CheckOptions: {modernize-use-using.IgnoreExternC: true}}" -- -I %S/Input/use-using/ + +// Some Header +extern "C" { + +typedef int NewInt; +} + +extern "C++" { + +typedef int InExternCPP; +// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] +// CHECK-FIXES: using InExternCPP = int; +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp index 422abee11a719..462bc984fd3ad 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-using.cpp @@ -325,3 +325,20 @@ typedef bool (*ISSUE_65055_2)(int); typedef class ISSUE_67529_1 *ISSUE_67529; // CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' // CHECK-FIXES: using ISSUE_67529 = class ISSUE_67529_1 *; + +// Some Header +extern "C" { + +typedef int InExternC; +// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] +// CHECK-FIXES: using InExternC = int; + +} + +extern "C++" { + +typedef int InExternCPP; +// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: use 'using' instead of 'typedef' [modernize-use-using] +// CHECK-FIXES: using InExternCPP = int; + +} From 7a48039eb79fc887f473e80618b6bc98effea077 Mon Sep 17 00:00:00 2001 From: Piotr Zegar <me@piotrzegar.pl> Date: Tue, 26 Dec 2023 10:20:10 +0000 Subject: [PATCH 284/342] [clang-tidy] Fix compilation of modernize-use-using check Fix compilation issue introduced by #69102. --- clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp index f5fc3ad3fac68..bb05f206c717c 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.cpp @@ -14,7 +14,7 @@ using namespace clang::ast_matchers; namespace { AST_MATCHER(clang::LinkageSpecDecl, isExternCLinkage) { - return Node.getLanguage() == clang::LinkageSpecDecl::lang_c; + return Node.getLanguage() == clang::LinkageSpecLanguageIDs::C; } } // namespace From d51e06c73c33b6c35ee47542a9674da63c6b7f07 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev <ivan.kosarev@amd.com> Date: Tue, 26 Dec 2023 11:34:16 +0000 Subject: [PATCH 285/342] [AMDGPU][True16] Fix the VGPR register class for 16-bit values. 
(#76170) --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 2 +- llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 021d797344c55..732a8a9ed8e81 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2661,7 +2661,7 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const { if (BitWidth == 1) return &AMDGPU::VReg_1RegClass; if (BitWidth == 16) - return &AMDGPU::VGPR_LO16RegClass; + return &AMDGPU::VGPR_16RegClass; if (BitWidth == 32) return &AMDGPU::VGPR_32RegClass; return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth) diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index 444d6122eb731..3450700cadb08 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -100,7 +100,9 @@ define amdgpu_kernel void @fadd_f16( ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -257,13 +259,14 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-GISEL-NEXT: v_mov_b16_e32 v1.l, 0x3c00 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x3c00 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -400,13 +403,14 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_mov_b32 s6, -1 ; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-GISEL-NEXT: v_mov_b16_e32 v1.l, 0x4000 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x4000 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) From 75073e0c8839f18de60d15515092d410a80a58cf Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Tue, 26 Dec 2023 22:03:23 +0800 Subject: [PATCH 286/342] [X86][NFC] Simplify the definitions of MUL/IMUL and DIV/IDIV This patch is to extract the NFC in #76319 into a separate commit. 
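
The gist of the simplification, as a stand-alone TableGen sketch (the `Inst` class and the record bodies here are made up for illustration; the real patch folds the per-width MUL/IMUL and DIV/IDIV definitions into the `Mul` and `Div` multiclasses shown in the diff): a `multiclass` stamps out one record per width, and a single `defm` replaces the previously hand-written `def MUL16r`, `def MUL32r`, ... lines.

```tablegen
// Stand-alone sketch; feed it to llvm-tblgen to see the expansion.
class Inst<string asm> {
  string AsmString = asm;
}

multiclass MulLike<string mnemonic> {
  def 16r : Inst<mnemonic # "16r">;   // expands to e.g. MUL16r
  def 32r : Inst<mnemonic # "32r">;   // expands to e.g. MUL32r
  def 16m : Inst<mnemonic # "16m">;
  def 32m : Inst<mnemonic # "32m">;
}

// One defm per mnemonic instead of four hand-written defs each.
defm MUL  : MulLike<"mul">;
defm IMUL : MulLike<"imul">;
```
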
--- llvm/lib/Target/X86/X86InstrArithmetic.td | 441 ++++++++-------------- 1 file changed, 162 insertions(+), 279 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 2e59a2a1d673c..0582270285180 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -115,47 +115,47 @@ class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node> } // BinOpRI - Instructions that read "reg, imm". -class BinOpRI<bits<8> o, string m, X86TypeInfo t, Format f, dag out, list<dag> p> +class BinOpRI<bits<8> o, string m, string args, X86TypeInfo t, Format f, dag out, list<dag> p> : ITy<o, f, t, out, (ins t.RegClass:$src1, t.ImmOperand:$src2), m, - binop_args, p>, Sched<[WriteALU]> { + args, p>, Sched<[WriteALU]> { let ImmT = t.ImmEncoding; } // BinOpRI_F - Instructions that read "reg, imm" and write EFLAGS only. class BinOpRI_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, Format f> - : BinOpRI<o, m, t, f, (outs), + : BinOpRI<o, m, binop_args, t, f, (outs), [(set EFLAGS, (node t.RegClass:$src1, t.ImmOperator:$src2))]>, DefEFLAGS; // BinOpRI_RF - Instructions that read "reg, imm" and write "reg", EFLAGS. class BinOpRI_RF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> - : BinOpRI<o, m, t, f, (outs t.RegClass:$dst), + : BinOpRI<o, m, binop_args, t, f, (outs t.RegClass:$dst), [(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1, t.ImmOperator:$src2))]>, DefEFLAGS; // BinOpRIF_RF - Instructions that read "reg, imm", write "reg" and read/write // EFLAGS. class BinOpRIF_RF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> - : BinOpRI<o, m, t, f, (outs t.RegClass:$dst), + : BinOpRI<o, m, binop_args, t, f, (outs t.RegClass:$dst), [(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1, t.ImmOperator:$src2, EFLAGS))]>, DefEFLAGS, UseEFLAGS { let SchedRW = [WriteADC]; } // BinOpRI8 - Instructions that read "reg, imm8". -class BinOpRI8<bits<8> o, string m, X86TypeInfo t, Format f, dag out> +class BinOpRI8<bits<8> o, string m, string args, X86TypeInfo t, Format f, dag out> : ITy<o, f, t, out, (ins t.RegClass:$src1, t.Imm8Operand:$src2), m, - binop_args, []>, Sched<[WriteALU]> { + args, []>, Sched<[WriteALU]> { let ImmT = Imm8; } // BinOpRI8_F - Instructions that read "reg, imm8" and write EFLAGS only. class BinOpRI8_F<bits<8> o, string m, X86TypeInfo t, Format f> - : BinOpRI8<o, m, t, f, (outs)>, DefEFLAGS; + : BinOpRI8<o, m, binop_args, t, f, (outs)>, DefEFLAGS; // BinOpRI8_RF - Instructions that read "reg, imm8" and write "reg", EFLAGS. class BinOpRI8_RF<bits<8> o, string m, X86TypeInfo t, Format f> - : BinOpRI8<o, m, t, f, (outs t.RegClass:$dst)>, DefEFLAGS; + : BinOpRI8<o, m, binop_args, t, f, (outs t.RegClass:$dst)>, DefEFLAGS; // BinOpRI8F_RF - Instructions that read "reg, imm", write "reg" and read/write // EFLAGS. class BinOpRI8F_RF<bits<8> o, string m, X86TypeInfo t, Format f> - : BinOpRI8<o, m, t, f, (outs t.RegClass:$dst)>, DefEFLAGS, UseEFLAGS { + : BinOpRI8<o, m, binop_args, t, f, (outs t.RegClass:$dst)>, DefEFLAGS, UseEFLAGS { let SchedRW = [WriteADC]; } @@ -200,21 +200,21 @@ class BinOpMRF_MF<bits<8> o, string m, X86TypeInfo t, SDNode node> } // BinOpMI - Instructions that read "[mem], imm". 
-class BinOpMI<bits<8> o, string m, X86TypeInfo t, Format f, list<dag> p> - : ITy<o, f, t, (outs), (ins t.MemOperand:$src1, t.ImmOperand:$src2), m, - binop_args, p> { +class BinOpMI<bits<8> o, string m, string args, X86TypeInfo t, Format f, dag out, list<dag> p> + : ITy<o, f, t, out, (ins t.MemOperand:$src1, t.ImmOperand:$src2), m, + args, p> { let ImmT = t.ImmEncoding; let mayLoad = 1; } // BinOpMI_F - Instructions that read "[mem], imm" and write EFLAGS only. class BinOpMI_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, Format f> - : BinOpMI<o, m, t, f, + : BinOpMI<o, m, binop_args, t, f, (outs), [(set EFLAGS, (node (t.LoadNode addr:$src1), t.ImmOperator:$src2))]>, Sched<[WriteALU.Folded]>, DefEFLAGS; // BinOpMI_MF - Instructions that read "[mem], imm" and write "[mem]", EFLAGS. class BinOpMI_MF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> - : BinOpMI<o, m, t, f, + : BinOpMI<o, m, binop_args, t, f, (outs), [(store (node (t.VT (load addr:$src1)), t.ImmOperator:$src2), addr:$src1), (implicit EFLAGS)]>, Sched<[WriteALURMW]>, DefEFLAGS { @@ -223,7 +223,7 @@ class BinOpMI_MF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> // BinOpMIF_MF - Instructions that read "[mem], imm", write "[mem]" and // read/write EFLAGS. class BinOpMIF_MF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> - : BinOpMI<o, m, t, f, + : BinOpMI<o, m, binop_args, t, f, (outs), [(store (node (t.VT (load addr:$src1)), t.ImmOperator:$src2, EFLAGS), addr:$src1), (implicit EFLAGS)]>, Sched<[WriteADCRMW]>, DefEFLAGS, UseEFLAGS { @@ -231,24 +231,24 @@ class BinOpMIF_MF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f> } // BinOpMI8 - Instructions that read "[mem], imm8". -class BinOpMI8<string m, X86TypeInfo t, Format f> - : ITy<0x83, f, t, (outs), (ins t.MemOperand:$src1, t.Imm8Operand:$src2), m, - binop_args, []> { +class BinOpMI8<string m, string args, X86TypeInfo t, Format f, dag out> + : ITy<0x83, f, t, out, (ins t.MemOperand:$src1, t.Imm8Operand:$src2), m, + args, []> { let ImmT = Imm8; let mayLoad = 1; } // BinOpMI8_F - Instructions that read "[mem], imm8" and write EFLAGS only. class BinOpMI8_F<string m, X86TypeInfo t, Format f> - : BinOpMI8<m, t, f>, Sched<[WriteALU.Folded]>, DefEFLAGS; + : BinOpMI8<m, binop_args, t, f, (outs)>, Sched<[WriteALU.Folded]>, DefEFLAGS; // BinOpMI8_MF - Instructions that read "[mem], imm8" and write "[mem]", EFLAGS. class BinOpMI8_MF<string m, X86TypeInfo t, Format f> - : BinOpMI8<m, t, f>, Sched<[WriteALURMW]>, DefEFLAGS { + : BinOpMI8<m, binop_args, t, f, (outs)>, Sched<[WriteALURMW]>, DefEFLAGS { let mayStore = 1; } // BinOpMI8F_MF - Instructions that read "[mem], imm8", write "[mem]" and // read/write EFLAGS. class BinOpMI8F_MF<string m, X86TypeInfo t, Format f> - : BinOpMI8<m, t, f>, Sched<[WriteADCRMW]>, DefEFLAGS, UseEFLAGS { + : BinOpMI8<m, binop_args, t, f, (outs)>, Sched<[WriteADCRMW]>, DefEFLAGS, UseEFLAGS { let mayStore = 1; } @@ -290,33 +290,13 @@ class UnaryOpM<bits<8> o, Format f, string m, X86TypeInfo t, list<dag> p> let mayStore = 1; } -// INCDECR - Instructions like "inc reg". -class INCDECR<Format f, string m, X86TypeInfo t, SDPatternOperator node> - : UnaryOpR<0xFF, f, m, t, - [(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1, 1))]>, - DefEFLAGS { - let isConvertibleToThreeAddress = 1; // Can xform into LEA. -} - -// INCDECM - Instructions like "inc [mem]". 
-class INCDECM<Format f, string m, X86TypeInfo t, int num> - : UnaryOpM<0xFF, f, m, t, - [(store (add (t.LoadNode addr:$dst), num), addr:$dst), - (implicit EFLAGS)]>, DefEFLAGS; - -// INCDECR_ALT - Instructions like "inc reg" short forms. -class INCDECR_ALT<bits<8> o, string m, X86TypeInfo t> - : UnaryOpR<o, AddRegFrm, m, t, []>, DefEFLAGS { - // Short forms only valid in 32-bit mode. Selected during MCInst lowering. - let Predicates = [Not64BitMode]; -} - -// MulOpR - Instructions like "mul reg". +//===----------------------------------------------------------------------===// +// MUL/IMUL and DIV/IDIV Instructions +// class MulOpR<bits<8> o, Format f, string m, X86TypeInfo t, X86FoldableSchedWrite sched, list<dag> p> : ITy<o, f, t, (outs), (ins t.RegClass:$src), m, "$src", p>, Sched<[sched]>; -// MulOpM - Instructions like "mul [mem]". class MulOpM<bits<8> o, Format f, string m, X86TypeInfo t, X86FoldableSchedWrite sched, list<dag> p> : ITy<o, f, t, (outs), (ins t.MemOperand:$src), m, @@ -324,88 +304,144 @@ class MulOpM<bits<8> o, Format f, string m, X86TypeInfo t, let mayLoad = 1; } -// NegOpR - Instructions like "neg reg". -class NegOpR<bits<8> o, string m, X86TypeInfo t> - : UnaryOpR<o, MRM3r, m, t, - [(set t.RegClass:$dst, (ineg t.RegClass:$src1)), - (implicit EFLAGS)]>, DefEFLAGS; - -// NegOpM - Instructions like "neg [mem]". -class NegOpM<bits<8> o, string m, X86TypeInfo t> - : UnaryOpM<o, MRM3m, m, t, - [(store (ineg (t.LoadNode addr:$dst)), addr:$dst), - (implicit EFLAGS)]>, DefEFLAGS; - -// NOTE: NOT does not set EFLAGS! -// NotOpR - Instructions like "not reg". -class NotOpR<bits<8> o, string m, X86TypeInfo t> - : UnaryOpR<o, MRM2r, m, t, [(set t.RegClass:$dst, (not t.RegClass:$src1))]>; +multiclass Mul<bits<8> o, string m, Format RegMRM, Format MemMRM, SDPatternOperator node> { + // AL is really implied by AX, but the registers in Defs must match the + // SDNode results (i8, i32). + // + // FIXME: Used for 8-bit mul, ignore result upper 8 bits. + // This probably ought to be moved to a def : Pat<> if the + // syntax can be accepted. + let Defs = [AL,EFLAGS,AX], Uses = [AL] in + def 8r : MulOpR<o, RegMRM, m, Xi8, WriteIMul8, + [(set AL, (node AL, GR8:$src)), (implicit EFLAGS)]>; + let Defs = [AX,DX,EFLAGS], Uses = [AX] in + def 16r : MulOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, OpSize16; + let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in + def 32r : MulOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, OpSize32; + let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in + def 64r : MulOpR<o, RegMRM, m, Xi64, WriteIMul64, []>; + let Defs = [AL,EFLAGS,AX], Uses = [AL] in + def 8m : MulOpM<o, MemMRM, m, Xi8, WriteIMul8, + [(set AL, (node AL, (loadi8 addr:$src))), (implicit EFLAGS)]>; + let Defs = [AX,DX,EFLAGS], Uses = [AX] in + def 16m : MulOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, OpSize16; + let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in + def 32m : MulOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, OpSize32; + let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in + def 64m : MulOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, Requires<[In64BitMode]>; +} -// NotOpM - Instructions like "neg [mem]". 
-class NotOpM<bits<8> o, string m, X86TypeInfo t> - : UnaryOpM<o, MRM2m, m, t, - [(store (not (t.LoadNode addr:$dst)), addr:$dst)]>; +defm MUL : Mul<0xF7, "mul", MRM4r, MRM4m, mul>; +defm IMUL : Mul<0xF7, "imul", MRM5r, MRM5m, null_frag>; + +multiclass Div<bits<8> o, string m, Format RegMRM, Format MemMRM> { + defvar sched8 = !if(!eq(m, "div"), WriteDiv8, WriteIDiv8); + defvar sched16 = !if(!eq(m, "div"), WriteDiv16, WriteIDiv16); + defvar sched32 = !if(!eq(m, "div"), WriteDiv32, WriteIDiv32); + defvar sched64 = !if(!eq(m, "div"), WriteDiv64, WriteIDiv64); + let Defs = [AL,AH,EFLAGS], Uses = [AX] in + def 8r : MulOpR<o, RegMRM, m, Xi8, sched8, []>; + let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in + def 16r : MulOpR<o, RegMRM, m, Xi16, sched16, []>, OpSize16; + let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in + def 32r : MulOpR<o, RegMRM, m, Xi32, sched32, []>, OpSize32; + let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in + def 64r : MulOpR<o, RegMRM, m, Xi64, sched64, []>; + let Defs = [AL,AH,EFLAGS], Uses = [AX] in + def 8m : MulOpM<o, MemMRM, m, Xi8, sched8, []>; + let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in + def 16m : MulOpM<o, MemMRM, m, Xi16, sched16, []>, OpSize16; + let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in + def 32m : MulOpM<o, MemMRM, m, Xi32, sched32, []>, OpSize32; + let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in + def 64m : MulOpM<o, MemMRM, m, Xi64, sched64, []>, Requires<[In64BitMode]>; +} +let hasSideEffects = 1 in { // so that we don't speculatively execute +defm DIV: Div<0xF7, "div", MRM6r, MRM6m>; +defm IDIV: Div<0xF7, "idiv", MRM7r, MRM7m>; +} -// IMulOpRR - Instructions like "imul reg, reg, i8". -class IMulOpRR<bits<8> o, string m, X86TypeInfo t, X86FoldableSchedWrite sched> - : BinOpRR_RF<o, m, t, X86smul_flag>, TB { +class IMulOpRR<X86TypeInfo t, X86FoldableSchedWrite sched> + : BinOpRR_RF<0xAF, "imul", t, X86smul_flag>, TB { let Form = MRMSrcReg; let SchedRW = [sched]; // X = IMUL Y, Z --> X = IMUL Z, Y let isCommutable = 1; } - -// IMulOpRM - Instructions like "imul reg, reg, [mem]". -class IMulOpRM<bits<8> o, string m, X86TypeInfo t, X86FoldableSchedWrite sched> - : BinOpRM_RF<o, m, t, X86smul_flag>, TB { +class IMulOpRM<X86TypeInfo t, X86FoldableSchedWrite sched> + : BinOpRM_RF<0xAF, "imul", t, X86smul_flag>, TB { let Form = MRMSrcMem; let SchedRW = [sched.Folded, sched.ReadAfterFold]; } -// IMulOpRRI8 - Instructions like "imul reg, reg, i8". -class IMulOpRRI8<bits<8> o, string m, X86TypeInfo t, - X86FoldableSchedWrite sched> - : ITy<o, MRMSrcReg, t, (outs t.RegClass:$dst), - (ins t.RegClass:$src1, t.Imm8Operand:$src2), m, - "{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[sched]>, DefEFLAGS { - let ImmT = Imm8; +let Constraints = "$src1 = $dst" in { +def IMUL16rr : IMulOpRR<Xi16, WriteIMul16Reg>, OpSize16; +def IMUL32rr : IMulOpRR<Xi32, WriteIMul32Reg>, OpSize32; +def IMUL64rr : IMulOpRR<Xi64, WriteIMul64Reg>; +def IMUL16rm : IMulOpRM<Xi16, WriteIMul16Reg>, OpSize16; +def IMUL32rm : IMulOpRM<Xi32, WriteIMul32Reg>, OpSize32; +def IMUL64rm : IMulOpRM<Xi64, WriteIMul64Reg>; } -// IMulOpRRI - Instructions like "imul reg, reg, i16/i32/i64". 
-class IMulOpRRI<bits<8> o, string m, X86TypeInfo t, - X86FoldableSchedWrite sched> - : ITy<o, MRMSrcReg, t, (outs t.RegClass:$dst), - (ins t.RegClass:$src1, t.ImmOperand:$src2), m, - "{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set t.RegClass:$dst, EFLAGS, (X86smul_flag t.RegClass:$src1, - t.ImmNoSuOperator:$src2))]>, - Sched<[sched]>, DefEFLAGS { - let ImmT = t.ImmEncoding; +class IMulOpRI8_R<X86TypeInfo t, X86FoldableSchedWrite sched> + : BinOpRI8<0x6B, "imul", binop_ndd_args, t, MRMSrcReg, + (outs t.RegClass:$dst)>, DefEFLAGS { + let SchedRW = [sched]; } - -// IMulOpRMI8 - Instructions like "imul reg, [mem], i8". -class IMulOpRMI8<bits<8> o, string m, X86TypeInfo t, - X86FoldableSchedWrite sched> - : ITy<o, MRMSrcMem, t, (outs t.RegClass:$dst), - (ins t.MemOperand:$src1, t.Imm8Operand:$src2), m, - "{$src2, $src1, $dst|$dst, $src1, $src2}", []>, Sched<[sched.Folded]>, +class IMulOpRI_R<X86TypeInfo t, X86FoldableSchedWrite sched> + : BinOpRI<0x69, "imul", binop_ndd_args, t, MRMSrcReg, + (outs t.RegClass:$dst), + [(set t.RegClass:$dst, EFLAGS, (X86smul_flag t.RegClass:$src1, + t.ImmNoSuOperator:$src2))]>, DefEFLAGS { + let SchedRW = [sched]; +} +class IMulOpMI8_R<X86TypeInfo t, X86FoldableSchedWrite sched> + : BinOpMI8<"imul", binop_ndd_args, t, MRMSrcMem, (outs t.RegClass:$dst)>, DefEFLAGS { - let ImmT = Imm8; - let mayLoad = 1; + let Opcode = 0x6B; + let SchedRW = [sched.Folded]; } - -// IMulOpRMI - Instructions like "imul reg, [mem], i16/i32/i64". -class IMulOpRMI<bits<8> o, string m, X86TypeInfo t, - X86FoldableSchedWrite sched> - : ITy<o, MRMSrcMem, t, (outs t.RegClass:$dst), - (ins t.MemOperand:$src1, t.ImmOperand:$src2), m, - "{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set t.RegClass:$dst, EFLAGS, - (X86smul_flag (t.LoadNode addr:$src1), t.ImmNoSuOperator:$src2))]>, - Sched<[sched.Folded]>, DefEFLAGS { - let ImmT = t.ImmEncoding; +class IMulOpMI_R<X86TypeInfo t, X86FoldableSchedWrite sched> + : BinOpMI<0x69, "imul", binop_ndd_args, t, MRMSrcMem, + (outs t.RegClass:$dst), + [(set t.RegClass:$dst, EFLAGS, (X86smul_flag (t.LoadNode addr:$src1), + t.ImmNoSuOperator:$src2))]>, + DefEFLAGS { + let SchedRW = [sched.Folded]; } +def IMUL16rri8 : IMulOpRI8_R<Xi16, WriteIMul16Imm>, OpSize16; +def IMUL32rri8 : IMulOpRI8_R<Xi32, WriteIMul32Imm>, OpSize32; +def IMUL64rri8 : IMulOpRI8_R<Xi64, WriteIMul64Imm>; +def IMUL16rri : IMulOpRI_R<Xi16, WriteIMul16Imm>, OpSize16; +def IMUL32rri : IMulOpRI_R<Xi32, WriteIMul32Imm>, OpSize32; +def IMUL64rri32 : IMulOpRI_R<Xi64, WriteIMul64Imm>; + +def IMUL16rmi8 : IMulOpMI8_R<Xi16, WriteIMul16Imm>, OpSize16; +def IMUL32rmi8 : IMulOpMI8_R<Xi32, WriteIMul32Imm>, OpSize32; +def IMUL64rmi8 : IMulOpMI8_R<Xi64, WriteIMul64Imm>; +def IMUL16rmi : IMulOpMI_R<Xi16, WriteIMul16Imm>, OpSize16; +def IMUL32rmi : IMulOpMI_R<Xi32, WriteIMul32Imm>, OpSize32; +def IMUL64rmi32 : IMulOpMI_R<Xi64, WriteIMul64Imm>; +//===----------------------------------------------------------------------===// +// INC and DEC Instructions +// +class INCDECR<Format f, string m, X86TypeInfo t, SDPatternOperator node> + : UnaryOpR<0xFF, f, m, t, + [(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1, 1))]>, + DefEFLAGS { + let isConvertibleToThreeAddress = 1; // Can xform into LEA. +} +class INCDECM<Format f, string m, X86TypeInfo t, int num> + : UnaryOpM<0xFF, f, m, t, + [(store (add (t.LoadNode addr:$dst), num), addr:$dst), + (implicit EFLAGS)]>, DefEFLAGS; +// INCDECR_ALT - Instructions like "inc reg" short forms. 
+class INCDECR_ALT<bits<8> o, string m, X86TypeInfo t> + : UnaryOpR<o, AddRegFrm, m, t, []>, DefEFLAGS { + // Short forms only valid in 32-bit mode. Selected during MCInst lowering. + let Predicates = [Not64BitMode]; +} let Constraints = "$src1 = $dst" in { def INC16r_alt : INCDECR_ALT<0x40, "inc", Xi16>, OpSize16; def INC32r_alt : INCDECR_ALT<0x40, "inc", Xi32>, OpSize32; @@ -413,7 +449,6 @@ def INC8r : INCDECR<MRM0r, "inc", Xi8, X86add_flag_nocf>; def INC16r : INCDECR<MRM0r, "inc", Xi16, X86add_flag_nocf>, OpSize16; def INC32r : INCDECR<MRM0r, "inc", Xi32, X86add_flag_nocf>, OpSize32; def INC64r : INCDECR<MRM0r, "inc", Xi64, X86add_flag_nocf>; - def DEC16r_alt : INCDECR_ALT<0x48, "dec", Xi16>, OpSize16; def DEC32r_alt : INCDECR_ALT<0x48, "dec", Xi32>, OpSize32; def DEC8r : INCDECR<MRM1r, "dec", Xi8, X86sub_flag_nocf>; @@ -421,7 +456,6 @@ def DEC16r : INCDECR<MRM1r, "dec", Xi16, X86sub_flag_nocf>, OpSize16; def DEC32r : INCDECR<MRM1r, "dec", Xi32, X86sub_flag_nocf>, OpSize32; def DEC64r : INCDECR<MRM1r, "dec", Xi64, X86sub_flag_nocf>; } - let Predicates = [UseIncDec] in { def INC8m : INCDECM<MRM0m, "inc", Xi8, 1>; def INC16m : INCDECM<MRM0m, "inc", Xi16, 1>, OpSize16; @@ -435,176 +469,25 @@ def INC64m : INCDECM<MRM0m, "inc", Xi64, 1>; def DEC64m : INCDECM<MRM1m, "dec", Xi64, -1>; } -// Extra precision multiplication - -// AL is really implied by AX, but the registers in Defs must match the -// SDNode results (i8, i32). -// AL,AH = AL*GR8 -let Defs = [AL,EFLAGS,AX], Uses = [AL] in -def MUL8r : MulOpR<0xF6, MRM4r, "mul", Xi8, WriteIMul8, - // FIXME: Used for 8-bit mul, ignore result upper 8 bits. - // This probably ought to be moved to a def : Pat<> if the - // syntax can be accepted. - [(set AL, (mul AL, GR8:$src)), (implicit EFLAGS)]>; -// AX,DX = AX*GR16 -let Defs = [AX,DX,EFLAGS], Uses = [AX] in -def MUL16r : MulOpR<0xF7, MRM4r, "mul", Xi16, WriteIMul16, []>, OpSize16; -// EAX,EDX = EAX*GR32 -let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in -def MUL32r : MulOpR<0xF7, MRM4r, "mul", Xi32, WriteIMul32, - [/*(set EAX, EDX, EFLAGS, (X86umul_flag EAX, GR32:$src))*/]>, OpSize32; -// RAX,RDX = RAX*GR64 -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in -def MUL64r : MulOpR<0xF7, MRM4r, "mul", Xi64, WriteIMul64, - [/*(set RAX, RDX, EFLAGS, (X86umul_flag RAX, GR64:$src))*/]>; -// AL,AH = AL*[mem8] -let Defs = [AL,EFLAGS,AX], Uses = [AL] in -def MUL8m : MulOpM<0xF6, MRM4m, "mul", Xi8, WriteIMul8, - // FIXME: Used for 8-bit mul, ignore result upper 8 bits. - // This probably ought to be moved to a def : Pat<> if the - // syntax can be accepted. 
- [(set AL, (mul AL, (loadi8 addr:$src))), - (implicit EFLAGS)]>; -// AX,DX = AX*[mem16] -let Defs = [AX,DX,EFLAGS], Uses = [AX] in -def MUL16m : MulOpM<0xF7, MRM4m, "mul", Xi16, WriteIMul16, []>, OpSize16; -// EAX,EDX = EAX*[mem32] -let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in -def MUL32m : MulOpM<0xF7, MRM4m, "mul", Xi32, WriteIMul32, []>, OpSize32; -// RAX,RDX = RAX*[mem64] -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in -def MUL64m : MulOpM<0xF7, MRM4m, "mul", Xi64, WriteIMul64, []>, - Requires<[In64BitMode]>; - -// AL,AH = AL*GR8 -let Defs = [AL,EFLAGS,AX], Uses = [AL] in -def IMUL8r : MulOpR<0xF6, MRM5r, "imul", Xi8, WriteIMul8, []>; -// AX,DX = AX*GR16 -let Defs = [AX,DX,EFLAGS], Uses = [AX] in -def IMUL16r : MulOpR<0xF7, MRM5r, "imul", Xi16, WriteIMul16, []>, OpSize16; -// EAX,EDX = EAX*GR32 -let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in -def IMUL32r : MulOpR<0xF7, MRM5r, "imul", Xi32, WriteIMul32, []>, OpSize32; -// RAX,RDX = RAX*GR64 -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in -def IMUL64r : MulOpR<0xF7, MRM5r, "imul", Xi64, WriteIMul64, []>; - -// AL,AH = AL*[mem8] -let Defs = [AL,EFLAGS,AX], Uses = [AL] in -def IMUL8m : MulOpM<0xF6, MRM5m, "imul", Xi8, WriteIMul8, []>; -// AX,DX = AX*[mem16] -let Defs = [AX,DX,EFLAGS], Uses = [AX] in -def IMUL16m : MulOpM<0xF7, MRM5m, "imul", Xi16, WriteIMul16, []>, OpSize16; -// EAX,EDX = EAX*[mem32] -let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in -def IMUL32m : MulOpM<0xF7, MRM5m, "imul", Xi32, WriteIMul32, []>, OpSize32; -// RAX,RDX = RAX*[mem64] -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in -def IMUL64m : MulOpM<0xF7, MRM5m, "imul", Xi64, WriteIMul64, []>, - Requires<[In64BitMode]>; - -let Constraints = "$src1 = $dst" in { -// Register-Register Signed Integer Multiply -def IMUL16rr : IMulOpRR<0xAF, "imul", Xi16, WriteIMul16Reg>, OpSize16; -def IMUL32rr : IMulOpRR<0xAF, "imul", Xi32, WriteIMul32Reg>, OpSize32; -def IMUL64rr : IMulOpRR<0xAF, "imul", Xi64, WriteIMul64Reg>; - -// Register-Memory Signed Integer Multiply -def IMUL16rm : IMulOpRM<0xAF, "imul", Xi16, WriteIMul16Reg>, OpSize16; -def IMUL32rm : IMulOpRM<0xAF, "imul", Xi32, WriteIMul32Reg>, OpSize32; -def IMUL64rm : IMulOpRM<0xAF, "imul", Xi64, WriteIMul64Reg>; -} +//===----------------------------------------------------------------------===// +// NEG and NOT Instructions +// +class NegOpR<bits<8> o, string m, X86TypeInfo t> + : UnaryOpR<o, MRM3r, m, t, + [(set t.RegClass:$dst, (ineg t.RegClass:$src1)), + (implicit EFLAGS)]>, DefEFLAGS; +class NegOpM<bits<8> o, string m, X86TypeInfo t> + : UnaryOpM<o, MRM3m, m, t, + [(store (ineg (t.LoadNode addr:$dst)), addr:$dst), + (implicit EFLAGS)]>, DefEFLAGS; -// Surprisingly enough, these are not two address instructions! -// NOTE: These are order specific, we want the ri8 forms to be listed -// first so that they are slightly preferred to the ri forms. 
- -// Register-Integer Signed Integer Multiply -// GR16 = GR16*I8 -def IMUL16rri8 : IMulOpRRI8<0x6B, "imul", Xi16, WriteIMul16Imm>, OpSize16; -// GR16 = GR16*I16 -def IMUL16rri : IMulOpRRI<0x69, "imul", Xi16, WriteIMul16Imm>, OpSize16; -// GR32 = GR32*I8 -def IMUL32rri8 : IMulOpRRI8<0x6B, "imul", Xi32, WriteIMul32Imm>, OpSize32; -// GR32 = GR32*I32 -def IMUL32rri : IMulOpRRI<0x69, "imul", Xi32, WriteIMul32Imm>, OpSize32; -// GR64 = GR64*I8 -def IMUL64rri8 : IMulOpRRI8<0x6B, "imul", Xi64, WriteIMul64Imm>; -// GR64 = GR64*I32 -def IMUL64rri32 : IMulOpRRI<0x69, "imul", Xi64, WriteIMul64Imm>; - -// Memory-Integer Signed Integer Multiply -// GR16 = [mem16]*I8 -def IMUL16rmi8 : IMulOpRMI8<0x6B, "imul", Xi16, WriteIMul16Imm>, OpSize16; -// GR16 = [mem16]*I16 -def IMUL16rmi : IMulOpRMI<0x69, "imul", Xi16, WriteIMul16Imm>, OpSize16; -// GR32 = [mem32]*I8 -def IMUL32rmi8 : IMulOpRMI8<0x6B, "imul", Xi32, WriteIMul32Imm>, OpSize32; -// GR32 = [mem32]*I32 -def IMUL32rmi : IMulOpRMI<0x69, "imul", Xi32, WriteIMul32Imm>, OpSize32; -// GR64 = [mem64]*I8 -def IMUL64rmi8 : IMulOpRMI8<0x6B, "imul", Xi64, WriteIMul64Imm>; -// GR64 = [mem64]*I32 -def IMUL64rmi32 : IMulOpRMI<0x69, "imul", Xi64, WriteIMul64Imm>; - -// unsigned division/remainder -let hasSideEffects = 1 in { // so that we don't speculatively execute -let Defs = [AL,AH,EFLAGS], Uses = [AX] in -// AX/r8 = AL,AH -def DIV8r : MulOpR<0xF6, MRM6r, "div", Xi8, WriteDiv8, []>; -let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in -// DX:AX/r16 = AX,DX -def DIV16r : MulOpR<0xF7, MRM6r, "div", Xi16, WriteDiv16, []>, OpSize16; -let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in -// EDX:EAX/r32 = EAX,EDX -def DIV32r : MulOpR<0xF7, MRM6r, "div", Xi32, WriteDiv32, []>, OpSize32; -// RDX:RAX/r64 = RAX,RDX -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in -def DIV64r : MulOpR<0xF7, MRM6r, "div", Xi64, WriteDiv64, []>; - -let mayLoad = 1 in { -let Defs = [AL,AH,EFLAGS], Uses = [AX] in -// AX/[mem8] = AL,AH -def DIV8m : MulOpM<0xF6, MRM6m, "div", Xi8, WriteDiv8, []>; -let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in -// DX:AX/[mem16] = AX,DX -def DIV16m : MulOpM<0xF7, MRM6m, "div", Xi16, WriteDiv16, []>, OpSize16; -let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX -def DIV32m : MulOpM<0xF7, MRM6m, "div", Xi32, WriteDiv32, []>, OpSize32; -// RDX:RAX/[mem64] = RAX,RDX -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in -def DIV64m : MulOpM<0xF7, MRM6m, "div", Xi64, WriteDiv64, []>, - Requires<[In64BitMode]>; -} +// NOTE: NOT does not set EFLAGS! +class NotOpR<bits<8> o, string m, X86TypeInfo t> + : UnaryOpR<o, MRM2r, m, t, [(set t.RegClass:$dst, (not t.RegClass:$src1))]>; -// Signed division/remainder. 
-let Defs = [AL,AH,EFLAGS], Uses = [AX] in -// AX/r8 = AL,AH -def IDIV8r : MulOpR<0xF6, MRM7r, "idiv", Xi8, WriteIDiv8, []>; -let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in -// DX:AX/r16 = AX,DX -def IDIV16r: MulOpR<0xF7, MRM7r, "idiv", Xi16, WriteIDiv16, []>, OpSize16; -let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in -// EDX:EAX/r32 = EAX,EDX -def IDIV32r: MulOpR<0xF7, MRM7r, "idiv", Xi32, WriteIDiv32, []>, OpSize32; -// RDX:RAX/r64 = RAX,RDX -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in -def IDIV64r: MulOpR<0xF7, MRM7r, "idiv", Xi64, WriteIDiv64, []>; - -let Defs = [AL,AH,EFLAGS], Uses = [AX] in -// AX/[mem8] = AL,AH -def IDIV8m : MulOpM<0xF6, MRM7m, "idiv", Xi8, WriteIDiv8, []>; -let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in -// DX:AX/[mem16] = AX,DX -def IDIV16m: MulOpM<0xF7, MRM7m, "idiv", Xi16, WriteIDiv16, []>, OpSize16; -let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in -// EDX:EAX/[mem32] = EAX,EDX -def IDIV32m: MulOpM<0xF7, MRM7m, "idiv", Xi32, WriteIDiv32, []>, OpSize32; -let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX -// RDX:RAX/[mem64] = RAX,RDX -def IDIV64m: MulOpM<0xF7, MRM7m, "idiv", Xi64, WriteIDiv64, []>, - Requires<[In64BitMode]>; -} // hasSideEffects = 1 +class NotOpM<bits<8> o, string m, X86TypeInfo t> + : UnaryOpM<o, MRM2m, m, t, + [(store (not (t.LoadNode addr:$dst)), addr:$dst)]>; let Constraints = "$src1 = $dst" in { def NEG8r : NegOpR<0xF6, "neg", Xi8>; From 898320d4e52c9110422a31a8afc896b9d211ed9d Mon Sep 17 00:00:00 2001 From: Jacek Caban <jacek@codeweavers.com> Date: Tue, 26 Dec 2023 15:10:44 +0100 Subject: [PATCH 287/342] [cmake] Disable all -Wuninitialized warnings on GCC older than 12. (#76251) As discussed in #75183, avoids dealing with GCC false positives. --- llvm/cmake/config-ix.cmake | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 7bb3e98333eff..3e12213f6e6b5 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -431,15 +431,13 @@ set(USE_NO_UNINITIALIZED 0) # Disable gcc's potentially uninitialized use analysis as it presents lots of # false positives. if (CMAKE_COMPILER_IS_GNUCXX) - check_cxx_compiler_flag("-Wmaybe-uninitialized" HAS_MAYBE_UNINITIALIZED) - if (HAS_MAYBE_UNINITIALIZED) - set(USE_NO_MAYBE_UNINITIALIZED 1) - else() - # Only recent versions of gcc make the distinction between -Wuninitialized - # and -Wmaybe-uninitialized. If -Wmaybe-uninitialized isn't supported, just - # turn off all uninitialized use warnings. + # Disable all -Wuninitialized warning for old GCC versions. + if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.0) check_cxx_compiler_flag("-Wuninitialized" HAS_UNINITIALIZED) set(USE_NO_UNINITIALIZED ${HAS_UNINITIALIZED}) + else() + check_cxx_compiler_flag("-Wmaybe-uninitialized" HAS_MAYBE_UNINITIALIZED) + set(USE_NO_MAYBE_UNINITIALIZED ${HAS_MAYBE_UNINITIALIZED}) endif() endif() From aca3727e97bced3e98f8e0719de95c93034a97d2 Mon Sep 17 00:00:00 2001 From: Sirui Mu <msrlancern@gmail.com> Date: Tue, 26 Dec 2023 22:49:41 +0800 Subject: [PATCH 288/342] [clang-tidy] Treat fields in anonymous records as names in enclosing scope when checking name styles (#75701) Currently, fields in anonymous records are treated as normal record members during naming style check. 
This can be undesirable in certain situations since these fields are used just like names in their enclosing scopes: ```c++ class Foo { union { int iv_; // warning: invalid case style for public member 'iv_' float fv_; // warning: invalid case style for public member 'fv_' }; }; ``` `iv_` and `fv_` are used in the code like private members of `Foo` but their naming style comes from rules for public members. This PR changes this behavior. It adds a new option `CheckAnonFieldInParent` to `readability-identifier-naming`. When set to `true`, fields in anonymous records will be treated as names in their enclosing scopes when checking name styles. Specifically: - If the anonymous record is defined within the file scope or in a namespace scope, treat its fields as global variables when checking name styles; - If the anonymous record is defined within a function, treat its fields as local variables when checking name styles; - If the anonymous record is defined within a non-anonymous record, treat its fields as non-static record members when checking name styles. --- .../readability/IdentifierNamingCheck.cpp | 219 +++++++++++------- .../readability/IdentifierNamingCheck.h | 26 ++- .../clang-tidy/utils/ASTUtils.cpp | 24 ++ clang-tools-extra/clang-tidy/utils/ASTUtils.h | 5 + clang-tools-extra/docs/ReleaseNotes.rst | 5 +- .../checks/readability/identifier-naming.rst | 27 +++ .../identifier-naming-anon-record-fields.cpp | 185 +++++++++++++++ 7 files changed, 398 insertions(+), 93 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-anon-record-fields.cpp diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp index 03dcfa5f81109..e6f44dd51b459 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.cpp @@ -9,6 +9,7 @@ #include "IdentifierNamingCheck.h" #include "../GlobList.h" +#include "../utils/ASTUtils.h" #include "clang/AST/CXXInheritance.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/Preprocessor.h" @@ -286,7 +287,9 @@ IdentifierNamingCheck::FileStyle IdentifierNamingCheck::getFileStyleFromOptions( HPTOpt.value_or(IdentifierNamingCheck::HPT_Off)); } bool IgnoreMainLike = Options.get("IgnoreMainLikeFunctions", false); - return {std::move(Styles), std::move(HNOption), IgnoreMainLike}; + bool CheckAnonFieldInParent = Options.get("CheckAnonFieldInParent", false); + return {std::move(Styles), std::move(HNOption), IgnoreMainLike, + CheckAnonFieldInParent}; } std::string IdentifierNamingCheck::HungarianNotation::getDeclTypeName( @@ -859,6 +862,8 @@ void IdentifierNamingCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { Options.store(Opts, "IgnoreFailedSplit", IgnoreFailedSplit); Options.store(Opts, "IgnoreMainLikeFunctions", MainFileStyle->isIgnoringMainLikeFunction()); + Options.store(Opts, "CheckAnonFieldInParent", + MainFileStyle->isCheckingAnonFieldInParentScope()); } bool IdentifierNamingCheck::matchesStyle( @@ -1111,7 +1116,7 @@ std::string IdentifierNamingCheck::fixupWithStyle( StyleKind IdentifierNamingCheck::findStyleKind( const NamedDecl *D, ArrayRef<std::optional<IdentifierNamingCheck::NamingStyle>> NamingStyles, - bool IgnoreMainLikeFunctions) const { + bool IgnoreMainLikeFunctions, bool CheckAnonFieldInParentScope) const { assert(D && D->getIdentifier() && !D->getName().empty() && !D->isImplicit() && "Decl must be an explicit identifier 
with a name."); @@ -1185,29 +1190,14 @@ StyleKind IdentifierNamingCheck::findStyleKind( } if (const auto *Decl = dyn_cast<FieldDecl>(D)) { - QualType Type = Decl->getType(); - - if (!Type.isNull() && Type.isConstQualified()) { - if (NamingStyles[SK_ConstantMember]) - return SK_ConstantMember; - - if (NamingStyles[SK_Constant]) - return SK_Constant; + if (CheckAnonFieldInParentScope) { + const RecordDecl *Record = Decl->getParent(); + if (Record->isAnonymousStructOrUnion()) { + return findStyleKindForAnonField(Decl, NamingStyles); + } } - if (Decl->getAccess() == AS_private && NamingStyles[SK_PrivateMember]) - return SK_PrivateMember; - - if (Decl->getAccess() == AS_protected && NamingStyles[SK_ProtectedMember]) - return SK_ProtectedMember; - - if (Decl->getAccess() == AS_public && NamingStyles[SK_PublicMember]) - return SK_PublicMember; - - if (NamingStyles[SK_Member]) - return SK_Member; - - return SK_Invalid; + return findStyleKindForField(Decl, Decl->getType(), NamingStyles); } if (const auto *Decl = dyn_cast<ParmVarDecl>(D)) { @@ -1244,66 +1234,7 @@ StyleKind IdentifierNamingCheck::findStyleKind( } if (const auto *Decl = dyn_cast<VarDecl>(D)) { - QualType Type = Decl->getType(); - - if (Decl->isConstexpr() && NamingStyles[SK_ConstexprVariable]) - return SK_ConstexprVariable; - - if (!Type.isNull() && Type.isConstQualified()) { - if (Decl->isStaticDataMember() && NamingStyles[SK_ClassConstant]) - return SK_ClassConstant; - - if (Decl->isFileVarDecl() && Type.getTypePtr()->isAnyPointerType() && - NamingStyles[SK_GlobalConstantPointer]) - return SK_GlobalConstantPointer; - - if (Decl->isFileVarDecl() && NamingStyles[SK_GlobalConstant]) - return SK_GlobalConstant; - - if (Decl->isStaticLocal() && NamingStyles[SK_StaticConstant]) - return SK_StaticConstant; - - if (Decl->isLocalVarDecl() && Type.getTypePtr()->isAnyPointerType() && - NamingStyles[SK_LocalConstantPointer]) - return SK_LocalConstantPointer; - - if (Decl->isLocalVarDecl() && NamingStyles[SK_LocalConstant]) - return SK_LocalConstant; - - if (Decl->isFunctionOrMethodVarDecl() && NamingStyles[SK_LocalConstant]) - return SK_LocalConstant; - - if (NamingStyles[SK_Constant]) - return SK_Constant; - } - - if (Decl->isStaticDataMember() && NamingStyles[SK_ClassMember]) - return SK_ClassMember; - - if (Decl->isFileVarDecl() && Type.getTypePtr()->isAnyPointerType() && - NamingStyles[SK_GlobalPointer]) - return SK_GlobalPointer; - - if (Decl->isFileVarDecl() && NamingStyles[SK_GlobalVariable]) - return SK_GlobalVariable; - - if (Decl->isStaticLocal() && NamingStyles[SK_StaticVariable]) - return SK_StaticVariable; - - if (Decl->isLocalVarDecl() && Type.getTypePtr()->isAnyPointerType() && - NamingStyles[SK_LocalPointer]) - return SK_LocalPointer; - - if (Decl->isLocalVarDecl() && NamingStyles[SK_LocalVariable]) - return SK_LocalVariable; - - if (Decl->isFunctionOrMethodVarDecl() && NamingStyles[SK_LocalVariable]) - return SK_LocalVariable; - - if (NamingStyles[SK_Variable]) - return SK_Variable; - - return SK_Invalid; + return findStyleKindForVar(Decl, Decl->getType(), NamingStyles); } if (const auto *Decl = dyn_cast<CXXMethodDecl>(D)) { @@ -1442,12 +1373,13 @@ IdentifierNamingCheck::getDeclFailureInfo(const NamedDecl *Decl, if (!FileStyle.isActive()) return std::nullopt; - return getFailureInfo(HungarianNotation.getDeclTypeName(Decl), - Decl->getName(), Decl, Loc, FileStyle.getStyles(), - FileStyle.getHNOption(), - findStyleKind(Decl, FileStyle.getStyles(), - FileStyle.isIgnoringMainLikeFunction()), - SM, IgnoreFailedSplit); + return 
getFailureInfo( + HungarianNotation.getDeclTypeName(Decl), Decl->getName(), Decl, Loc, + FileStyle.getStyles(), FileStyle.getHNOption(), + findStyleKind(Decl, FileStyle.getStyles(), + FileStyle.isIgnoringMainLikeFunction(), + FileStyle.isCheckingAnonFieldInParentScope()), + SM, IgnoreFailedSplit); } std::optional<RenamerClangTidyCheck::FailureInfo> @@ -1496,5 +1428,114 @@ IdentifierNamingCheck::getStyleForFile(StringRef FileName) const { return It.first->getValue(); } +StyleKind IdentifierNamingCheck::findStyleKindForAnonField( + const FieldDecl *AnonField, + ArrayRef<std::optional<NamingStyle>> NamingStyles) const { + const IndirectFieldDecl *IFD = + utils::findOutermostIndirectFieldDeclForField(AnonField); + assert(IFD && "Found an anonymous record field without an IndirectFieldDecl"); + + QualType Type = AnonField->getType(); + + if (const auto *F = dyn_cast<FieldDecl>(IFD->chain().front())) { + return findStyleKindForField(F, Type, NamingStyles); + } + + if (const auto *V = IFD->getVarDecl()) { + return findStyleKindForVar(V, Type, NamingStyles); + } + + return SK_Invalid; +} + +StyleKind IdentifierNamingCheck::findStyleKindForField( + const FieldDecl *Field, QualType Type, + ArrayRef<std::optional<NamingStyle>> NamingStyles) const { + if (!Type.isNull() && Type.isConstQualified()) { + if (NamingStyles[SK_ConstantMember]) + return SK_ConstantMember; + + if (NamingStyles[SK_Constant]) + return SK_Constant; + } + + if (Field->getAccess() == AS_private && NamingStyles[SK_PrivateMember]) + return SK_PrivateMember; + + if (Field->getAccess() == AS_protected && NamingStyles[SK_ProtectedMember]) + return SK_ProtectedMember; + + if (Field->getAccess() == AS_public && NamingStyles[SK_PublicMember]) + return SK_PublicMember; + + if (NamingStyles[SK_Member]) + return SK_Member; + + return SK_Invalid; +} + +StyleKind IdentifierNamingCheck::findStyleKindForVar( + const VarDecl *Var, QualType Type, + ArrayRef<std::optional<NamingStyle>> NamingStyles) const { + if (Var->isConstexpr() && NamingStyles[SK_ConstexprVariable]) + return SK_ConstexprVariable; + + if (!Type.isNull() && Type.isConstQualified()) { + if (Var->isStaticDataMember() && NamingStyles[SK_ClassConstant]) + return SK_ClassConstant; + + if (Var->isFileVarDecl() && Type.getTypePtr()->isAnyPointerType() && + NamingStyles[SK_GlobalConstantPointer]) + return SK_GlobalConstantPointer; + + if (Var->isFileVarDecl() && NamingStyles[SK_GlobalConstant]) + return SK_GlobalConstant; + + if (Var->isStaticLocal() && NamingStyles[SK_StaticConstant]) + return SK_StaticConstant; + + if (Var->isLocalVarDecl() && Type.getTypePtr()->isAnyPointerType() && + NamingStyles[SK_LocalConstantPointer]) + return SK_LocalConstantPointer; + + if (Var->isLocalVarDecl() && NamingStyles[SK_LocalConstant]) + return SK_LocalConstant; + + if (Var->isFunctionOrMethodVarDecl() && NamingStyles[SK_LocalConstant]) + return SK_LocalConstant; + + if (NamingStyles[SK_Constant]) + return SK_Constant; + } + + if (Var->isStaticDataMember() && NamingStyles[SK_ClassMember]) + return SK_ClassMember; + + if (Var->isFileVarDecl() && Type.getTypePtr()->isAnyPointerType() && + NamingStyles[SK_GlobalPointer]) + return SK_GlobalPointer; + + if (Var->isFileVarDecl() && NamingStyles[SK_GlobalVariable]) + return SK_GlobalVariable; + + if (Var->isStaticLocal() && NamingStyles[SK_StaticVariable]) + return SK_StaticVariable; + + if (Var->isLocalVarDecl() && Type.getTypePtr()->isAnyPointerType() && + NamingStyles[SK_LocalPointer]) + return SK_LocalPointer; + + if (Var->isLocalVarDecl() && 
NamingStyles[SK_LocalVariable]) + return SK_LocalVariable; + + if (Var->isFunctionOrMethodVarDecl() && NamingStyles[SK_LocalVariable]) + return SK_LocalVariable; + + if (NamingStyles[SK_Variable]) + return SK_Variable; + + return SK_Invalid; +} + } // namespace readability } // namespace clang::tidy diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h index 14626981cc42d..27c8e4bc768c4 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IdentifierNamingCheck.h @@ -127,9 +127,11 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck { struct FileStyle { FileStyle() : IsActive(false), IgnoreMainLikeFunctions(false) {} FileStyle(SmallVectorImpl<std::optional<NamingStyle>> &&Styles, - HungarianNotationOption HNOption, bool IgnoreMainLike) + HungarianNotationOption HNOption, bool IgnoreMainLike, + bool CheckAnonFieldInParent) : Styles(std::move(Styles)), HNOption(std::move(HNOption)), - IsActive(true), IgnoreMainLikeFunctions(IgnoreMainLike) {} + IsActive(true), IgnoreMainLikeFunctions(IgnoreMainLike), + CheckAnonFieldInParentScope(CheckAnonFieldInParent) {} ArrayRef<std::optional<NamingStyle>> getStyles() const { assert(IsActive); @@ -144,11 +146,16 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck { bool isActive() const { return IsActive; } bool isIgnoringMainLikeFunction() const { return IgnoreMainLikeFunctions; } + bool isCheckingAnonFieldInParentScope() const { + return CheckAnonFieldInParentScope; + } + private: SmallVector<std::optional<NamingStyle>, 0> Styles; HungarianNotationOption HNOption; bool IsActive; bool IgnoreMainLikeFunctions; + bool CheckAnonFieldInParentScope; }; IdentifierNamingCheck::FileStyle @@ -175,7 +182,7 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck { StyleKind findStyleKind( const NamedDecl *D, ArrayRef<std::optional<IdentifierNamingCheck::NamingStyle>> NamingStyles, - bool IgnoreMainLikeFunctions) const; + bool IgnoreMainLikeFunctions, bool CheckAnonFieldInParentScope) const; std::optional<RenamerClangTidyCheck::FailureInfo> getFailureInfo( StringRef Type, StringRef Name, const NamedDecl *ND, @@ -199,6 +206,19 @@ class IdentifierNamingCheck final : public RenamerClangTidyCheck { const FileStyle &getStyleForFile(StringRef FileName) const; + /// Find the style kind of a field in an anonymous record. + StyleKind findStyleKindForAnonField( + const FieldDecl *AnonField, + ArrayRef<std::optional<NamingStyle>> NamingStyles) const; + + StyleKind findStyleKindForField( + const FieldDecl *Field, QualType Type, + ArrayRef<std::optional<NamingStyle>> NamingStyles) const; + + StyleKind + findStyleKindForVar(const VarDecl *Var, QualType Type, + ArrayRef<std::optional<NamingStyle>> NamingStyles) const; + /// Stores the style options as a vector, indexed by the specified \ref /// StyleKind, for a given directory. 
mutable llvm::StringMap<FileStyle> NamingStylesCache; diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp index 64333f2c18745..fd5dadc9b01db 100644 --- a/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp +++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.cpp @@ -113,4 +113,28 @@ bool areStatementsIdentical(const Stmt *FirstStmt, const Stmt *SecondStmt, return DataFirst == DataSecond; } +const IndirectFieldDecl * +findOutermostIndirectFieldDeclForField(const FieldDecl *FD) { + const RecordDecl *Record = FD->getParent(); + assert(Record->isAnonymousStructOrUnion() && + "FD must be a field in an anonymous record"); + + const DeclContext *Context = Record; + while (isa<RecordDecl>(Context) && + cast<RecordDecl>(Context)->isAnonymousStructOrUnion()) { + Context = Context->getParent(); + } + + // Search for the target IndirectFieldDecl within the located context. + for (const auto *D : Context->decls()) { + const auto *IFD = dyn_cast<IndirectFieldDecl>(D); + if (!IFD) + continue; + if (IFD->getAnonField() == FD) + return IFD; + } + + return nullptr; +} + } // namespace clang::tidy::utils diff --git a/clang-tools-extra/clang-tidy/utils/ASTUtils.h b/clang-tools-extra/clang-tidy/utils/ASTUtils.h index 1bba5daf2fc76..6c3e54facd020 100644 --- a/clang-tools-extra/clang-tidy/utils/ASTUtils.h +++ b/clang-tools-extra/clang-tidy/utils/ASTUtils.h @@ -40,6 +40,11 @@ bool rangeCanBeFixed(SourceRange Range, const SourceManager *SM); bool areStatementsIdentical(const Stmt *FirstStmt, const Stmt *SecondStmt, const ASTContext &Context, bool Canonical = false); +// Given a field of an anonymous record, find its corresponding +// IndirectFieldDecl in the outermost possible scope. +const IndirectFieldDecl * +findOutermostIndirectFieldDeclForField(const FieldDecl *FD); + } // namespace clang::tidy::utils #endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_ASTUTILS_H diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index ce82063dbfe23..fe7f40d95fe6c 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -454,7 +454,10 @@ Changes in existing checks has been enhanced, particularly within complex types like function pointers and cases where style checks were omitted when functions started with macros. Added support for C++20 ``concept`` declarations. ``Camel_Snake_Case`` and - ``camel_Snake_Case`` now detect more invalid identifier names. + ``camel_Snake_Case`` now detect more invalid identifier names. Fields in + anonymous records (i.e. anonymous structs and unions) now can be checked with + the naming rules associated with their enclosing scopes rather than the naming + rules of public struct/union members. 
- Improved :doc:`readability-implicit-bool-conversion <clang-tidy/checks/readability/implicit-bool-conversion>` check to take diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-naming.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-naming.rst index e36bbee394f17..2affb55cfa9ad 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-naming.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/identifier-naming.rst @@ -42,6 +42,7 @@ The following options are described below: - :option:`AbstractClassCase`, :option:`AbstractClassPrefix`, :option:`AbstractClassSuffix`, :option:`AbstractClassIgnoredRegexp`, :option:`AbstractClassHungarianPrefix` - :option:`AggressiveDependentMemberLookup` + - :option:`CheckAnonFieldInParent` - :option:`ClassCase`, :option:`ClassPrefix`, :option:`ClassSuffix`, :option:`ClassIgnoredRegexp`, :option:`ClassHungarianPrefix` - :option:`ClassConstantCase`, :option:`ClassConstantPrefix`, :option:`ClassConstantSuffix`, :option:`ClassConstantIgnoredRegexp`, :option:`ClassConstantHungarianPrefix` - :option:`ClassMemberCase`, :option:`ClassMemberPrefix`, :option:`ClassMemberSuffix`, :option:`ClassMemberIgnoredRegexp`, :option:`ClassMemberHungarianPrefix` @@ -207,6 +208,32 @@ After if AggressiveDependentMemberLookup is `true`: } }; +.. option:: CheckAnonFieldInParent + + When set to `true`, fields in anonymous records (i.e. anonymous + unions and structs) will be treated as names in the enclosing scope + rather than public members of the anonymous record for the purpose + of name checking. + +For example: + +.. code-block:: c++ + + class Foo { + private: + union { + int iv_; + float fv_; + }; + }; + +If :option:`CheckAnonFieldInParent` is `false`, you may get warnings +that ``iv_`` and ``fv_`` are not coherent to public member names, because +``iv_`` and ``fv_`` are public members of the anonymous union. When +:option:`CheckAnonFieldInParent` is `true`, ``iv_`` and ``fv_`` will be +treated as private data members of ``Foo`` for the purpose of name checking +and thus no warnings will be emitted. + .. 
option:: ClassCase When defined, the check will ensure class names conform to the diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-anon-record-fields.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-anon-record-fields.cpp new file mode 100644 index 0000000000000..1b4d4e924a721 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming-anon-record-fields.cpp @@ -0,0 +1,185 @@ +// RUN: %check_clang_tidy -std=c++20 %s readability-identifier-naming %t -- \ +// RUN: -config='{CheckOptions: { \ +// RUN: readability-identifier-naming.CheckAnonFieldInParent: true, \ +// RUN: readability-identifier-naming.ClassConstantCase: CamelCase, \ +// RUN: readability-identifier-naming.ClassConstantPrefix: 'k', \ +// RUN: readability-identifier-naming.ClassMemberCase: CamelCase, \ +// RUN: readability-identifier-naming.ConstantCase: UPPER_CASE, \ +// RUN: readability-identifier-naming.ConstantSuffix: '_CST', \ +// RUN: readability-identifier-naming.ConstexprVariableCase: lower_case, \ +// RUN: readability-identifier-naming.GlobalConstantCase: UPPER_CASE, \ +// RUN: readability-identifier-naming.GlobalVariableCase: lower_case, \ +// RUN: readability-identifier-naming.GlobalVariablePrefix: 'g_', \ +// RUN: readability-identifier-naming.LocalConstantCase: CamelCase, \ +// RUN: readability-identifier-naming.LocalConstantPrefix: 'k', \ +// RUN: readability-identifier-naming.LocalVariableCase: lower_case, \ +// RUN: readability-identifier-naming.MemberCase: CamelCase, \ +// RUN: readability-identifier-naming.MemberPrefix: 'm_', \ +// RUN: readability-identifier-naming.ConstantMemberCase: lower_case, \ +// RUN: readability-identifier-naming.PrivateMemberPrefix: '__', \ +// RUN: readability-identifier-naming.ProtectedMemberPrefix: '_', \ +// RUN: readability-identifier-naming.PublicMemberCase: lower_case, \ +// RUN: readability-identifier-naming.StaticConstantCase: UPPER_CASE, \ +// RUN: readability-identifier-naming.StaticVariableCase: camelBack, \ +// RUN: readability-identifier-naming.StaticVariablePrefix: 's_', \ +// RUN: readability-identifier-naming.VariableCase: lower_case, \ +// RUN: readability-identifier-naming.GlobalPointerCase: CamelCase, \ +// RUN: readability-identifier-naming.GlobalPointerSuffix: '_Ptr', \ +// RUN: readability-identifier-naming.GlobalConstantPointerCase: UPPER_CASE, \ +// RUN: readability-identifier-naming.GlobalConstantPointerSuffix: '_Ptr', \ +// RUN: readability-identifier-naming.LocalPointerCase: CamelCase, \ +// RUN: readability-identifier-naming.LocalPointerPrefix: 'l_', \ +// RUN: readability-identifier-naming.LocalConstantPointerCase: CamelCase, \ +// RUN: readability-identifier-naming.LocalConstantPointerPrefix: 'lc_', \ +// RUN: }}' + +static union { + int global; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'global' +// CHECK-FIXES: {{^}} int g_global;{{$}} + + const int global_const; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: invalid case style for global constant 'global_const' +// CHECK-FIXES: {{^}} const int GLOBAL_CONST;{{$}} + + int *global_ptr; +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global pointer 'global_ptr' +// CHECK-FIXES: {{^}} int *GlobalPtr_Ptr;{{$}} + + int *const global_const_ptr; +// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for global constant pointer 'global_const_ptr' +// CHECK-FIXES: {{^}} int *const GLOBAL_CONST_PTR_Ptr;{{$}} +}; + +namespace ns { + +static union { 
+ int ns_global; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'ns_global' +// CHECK-FIXES: {{^}} int g_ns_global;{{$}} + + const int ns_global_const; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: invalid case style for global constant 'ns_global_const' +// CHECK-FIXES: {{^}} const int NS_GLOBAL_CONST;{{$}} + + int *ns_global_ptr; +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global pointer 'ns_global_ptr' +// CHECK-FIXES: {{^}} int *NsGlobalPtr_Ptr;{{$}} + + int *const ns_global_const_ptr; +// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for global constant pointer 'ns_global_const_ptr' +// CHECK-FIXES: {{^}} int *const NS_GLOBAL_CONST_PTR_Ptr;{{$}} +}; + +namespace { + +union { + int anon_ns_global; +// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: invalid case style for global variable 'anon_ns_global' +// CHECK-FIXES: {{^}} int g_anon_ns_global;{{$}} + + const int anon_ns_global_const; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: invalid case style for global constant 'anon_ns_global_const' +// CHECK-FIXES: {{^}} const int ANON_NS_GLOBAL_CONST;{{$}} + + int *anon_ns_global_ptr; +// CHECK-MESSAGES: :[[@LINE-1]]:8: warning: invalid case style for global pointer 'anon_ns_global_ptr' +// CHECK-FIXES: {{^}} int *AnonNsGlobalPtr_Ptr;{{$}} + + int *const anon_ns_global_const_ptr; +// CHECK-MESSAGES: :[[@LINE-1]]:14: warning: invalid case style for global constant pointer 'anon_ns_global_const_ptr' +// CHECK-FIXES: {{^}} int *const ANON_NS_GLOBAL_CONST_PTR_Ptr;{{$}} +}; + +} + +} + + +class Foo { +public: + union { + int PubMember; +// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for public member 'PubMember' +// CHECK-FIXES: {{^}} int pub_member;{{$}} + + const int PubConstMember; +// CHECK-MESSAGES: :[[@LINE-1]]:15: warning: invalid case style for constant member 'PubConstMember' +// CHECK-FIXES: {{^}} const int pub_const_member;{{$}} + + int *PubPtrMember; +// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for public member 'PubPtrMember' +// CHECK-FIXES: {{^}} int *pub_ptr_member;{{$}} + + int *const PubConstPtrMember; +// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: invalid case style for constant member 'PubConstPtrMember' +// CHECK-FIXES: {{^}} int *const pub_const_ptr_member;{{$}} + }; + +protected: + union { + int prot_member; +// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for protected member 'prot_member' +// CHECK-FIXES: {{^}} int _prot_member;{{$}} + + const int prot_const_member; + + int *prot_ptr_member; +// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for protected member 'prot_ptr_member' +// CHECK-FIXES: {{^}} int *_prot_ptr_member;{{$}} + + int *const prot_const_ptr_member; + }; + + +private: + union { + int pri_member; +// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for private member 'pri_member' +// CHECK-FIXES: {{^}} int __pri_member;{{$}} + + const int pri_const_member; + + int *pri_ptr_member; +// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for private member 'pri_ptr_member' +// CHECK-FIXES: {{^}} int *__pri_ptr_member;{{$}} + + int *const pri_const_ptr_member; + }; +}; + +void test() { + union { + int local; + + const int local_const; +// CHECK-MESSAGES: :[[@LINE-1]]:15: warning: invalid case style for local constant 'local_const' +// CHECK-FIXES: {{^}} const int kLocalConst;{{$}} + + int *local_ptr; +// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for local pointer 'local_ptr' +// 
CHECK-FIXES: {{^}} int *l_LocalPtr;{{$}} + + int *const local_const_ptr; +// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: invalid case style for local constant pointer 'local_const_ptr' +// CHECK-FIXES: {{^}} int *const lc_LocalConstPtr;{{$}} + }; + + static union { + int local_static; +// CHECK-MESSAGES: :[[@LINE-1]]:9: warning: invalid case style for static variable 'local_static' +// CHECK-FIXES: {{^}} int s_localStatic;{{$}} + + const int local_static_const; +// CHECK-MESSAGES: :[[@LINE-1]]:15: warning: invalid case style for static constant 'local_static_const' +// CHECK-FIXES: {{^}} const int LOCAL_STATIC_CONST;{{$}} + + int *local_static_ptr; +// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: invalid case style for static variable 'local_static_ptr' +// CHECK-FIXES: {{^}} int *s_localStaticPtr;{{$}} + + int *const local_static_const_ptr; +// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: invalid case style for static constant 'local_static_const_ptr' +// CHECK-FIXES: {{^}} int *const LOCAL_STATIC_CONST_PTR;{{$}} + }; +} From f11b056c02cca28fe0b82ec44c59537035100e67 Mon Sep 17 00:00:00 2001 From: Alexandre Ganea <37383324+aganea@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:33:42 +0000 Subject: [PATCH 289/342] [Support] Resolve symlinks in `getMainExecutable()` on Windows (#76304) This makes the Windows implementation for `getMainExecutable()` behave the same as its Linux counterpart, in regards to symlinks. Previously, when using `cmake ... -DLLVM_USE_SYMLINKS=ON`, calling this function wouldn't resolve to the "real", non-symlinked path. --- llvm/lib/Support/Windows/Path.inc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index 168a63bb2d969..2bf68b7972e74 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -154,7 +154,10 @@ std::string getMainExecutable(const char *argv0, void *MainExecAddr) { return ""; llvm::sys::path::make_preferred(PathNameUTF8); - return std::string(PathNameUTF8.data()); + + SmallString<256> RealPath; + sys::fs::real_path(PathNameUTF8, RealPath); + return std::string(RealPath); } UniqueID file_status::getUniqueID() const { From dc5fb32547627223967f11fa08886045bba804f5 Mon Sep 17 00:00:00 2001 From: Jacek Caban <jacek@codeweavers.com> Date: Tue, 26 Dec 2023 18:17:35 +0100 Subject: [PATCH 290/342] [lld][NFC] Revert commit ccec22b675195bf. (#76398) This reverts commit ccec22b675195bf45a5e34583a866ab881f94dde (#75183). It's no longer needed with #76251. --- lld/COFF/Writer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 2982165530c08..2e34a6c5cfa2c 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -560,7 +560,7 @@ void Writer::createECCodeMap() { codeMap.clear(); std::optional<chpe_range_type> lastType; - Chunk *first = nullptr, *last = nullptr; + Chunk *first, *last; auto closeRange = [&]() { if (lastType) { From 7c383481a8e86918b3aaca4288c1eed62a4d6ff4 Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman154@yahoo.com> Date: Tue, 26 Dec 2023 10:19:48 -0800 Subject: [PATCH 291/342] [llvm-exegesis] Add support for loading X86 segment registers (#76368) This patch adds support for setting the X86 segment registers. These registers are used in quite a few basic blocks in BHive and similar datasets, so being able to set them is necessary to ensure consistent runs as the live-in values of fs and gs can change across runs. Fixes #76340. 
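For reference, the mechanism the generated snippet relies on is the Linux
`arch_prctl` system call, which sets the FS/GS base for the calling thread.
A rough userspace equivalent (a sketch only, assuming an x86-64 Linux target
where <asm/prctl.h> provides ARCH_SET_FS/ARCH_SET_GS; the helper name is
illustrative, not part of the patch) looks like:

    #include <asm/prctl.h>    // ARCH_SET_FS / ARCH_SET_GS
    #include <sys/syscall.h>  // SYS_arch_prctl
    #include <unistd.h>       // syscall()

    // Point the FS and GS bases at previously mmap'ed buffers so that
    // %fs:0 / %gs:0 loads in the measured snippet read well-defined values.
    static int setSegmentBases(void *FsBase, void *GsBase) {
      if (syscall(SYS_arch_prctl, ARCH_SET_FS, FsBase) != 0)
        return -1;
      return syscall(SYS_arch_prctl, ARCH_SET_GS, GsBase) != 0 ? -1 : 0;
    }

The patch emits the same call as raw MCInsts (the request code loaded into RDI,
the value into RSI, then SYSCALL), saving and restoring the clobbered registers
around it; the direct include of <asm/prctl.h> is an assumption about the host
headers.
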
--- .../latency/segment-registers-subprocess.asm | 29 +++++ llvm/tools/llvm-exegesis/lib/X86/Target.cpp | 116 ++++++++++++------ 2 files changed, 110 insertions(+), 35 deletions(-) create mode 100644 llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm b/llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm new file mode 100644 index 0000000000000..5d5219f9375f2 --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm @@ -0,0 +1,29 @@ +# REQUIRES: exegesis-can-measure-latency, x86_64-linux + +# Check that the value of the segment registers is set properly when in +# subprocess mode. + +# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -snippets-file=%s -execution-mode=subprocess | FileCheck %s + +# LLVM-EXEGESIS-DEFREG FS 12345600 +# LLVM-EXEGESIS-DEFREG GS 2468ac00 +# LLVM-EXEGESIS-DEFREG R13 0 +# LLVM-EXEGESIS-DEFREG R14 127 +# LLVM-EXEGESIS-DEFREG R15 0 +# LLVM-EXEGESIS-MEM-DEF MEM1 4096 0000000012345600 +# LLVM-EXEGESIS-MEM-DEF MEM2 4096 000000002468ac00 +# LLVM-EXEGESIS-MEM-MAP MEM1 305418240 +# LLVM-EXEGESIS-MEM-MAP MEM2 610836480 + +movq %fs:0, %r13 +cmpq $0x12345600, %r13 +cmovneq %r14, %r15 +movq %gs:0, %r13 +cmpq $0x2468ac00, %r13 +cmovneq %r14, %r15 + +movq $60, %rax +movq %r15, %rdi +syscall + +# CHECK-NOT: error: 'Child benchmarking process exited with non-zero exit code: Child process returned with unknown exit code' diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index 2c2d1adb0fcf0..27eecc357fde3 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -39,6 +39,7 @@ #endif #ifdef __linux__ +#include <asm/prctl.h> #include <sys/mman.h> #include <sys/syscall.h> #include <unistd.h> @@ -907,9 +908,85 @@ void ExegesisX86Target::decrementLoopCounterAndJump( .addImm(X86::COND_NE); } +void generateRegisterStackPush(unsigned int Register, + std::vector<MCInst> &GeneratedCode) { + GeneratedCode.push_back(MCInstBuilder(X86::PUSH64r).addReg(Register)); +} + +void generateRegisterStackPop(unsigned int Register, + std::vector<MCInst> &GeneratedCode) { + GeneratedCode.push_back(MCInstBuilder(X86::POP64r).addReg(Register)); +} + +void generateSyscall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) { + GeneratedCode.push_back( + loadImmediate(X86::RAX, 64, APInt(64, SyscallNumber))); + GeneratedCode.push_back(MCInstBuilder(X86::SYSCALL)); +} + +constexpr std::array<unsigned, 6> SyscallArgumentRegisters{ + X86::RDI, X86::RSI, X86::RDX, X86::R10, X86::R8, X86::R9}; + +static void saveSyscallRegisters(std::vector<MCInst> &GeneratedCode, + unsigned ArgumentCount) { + assert(ArgumentCount <= 6 && + "System calls only X86-64 Linux can only take six arguments"); + // Preserve RCX and R11 (Clobbered by the system call). + generateRegisterStackPush(X86::RCX, GeneratedCode); + generateRegisterStackPush(X86::R11, GeneratedCode); + // Preserve RAX (used for the syscall number/return value). + generateRegisterStackPush(X86::RAX, GeneratedCode); + // Preserve the registers used to pass arguments to the system call. 
+ for (unsigned I = 0; I < ArgumentCount; ++I) + generateRegisterStackPush(SyscallArgumentRegisters[I], GeneratedCode); +} + +static void restoreSyscallRegisters(std::vector<MCInst> &GeneratedCode, + unsigned ArgumentCount) { + assert(ArgumentCount <= 6 && + "System calls only X86-64 Linux can only take six arguments"); + // Restore the argument registers, in the opposite order of the way they are + // saved. + for (unsigned I = ArgumentCount; I > 0; --I) { + generateRegisterStackPop(SyscallArgumentRegisters[I - 1], GeneratedCode); + } + generateRegisterStackPop(X86::RAX, GeneratedCode); + generateRegisterStackPop(X86::R11, GeneratedCode); + generateRegisterStackPop(X86::RCX, GeneratedCode); +} + +static std::vector<MCInst> loadImmediateSegmentRegister(unsigned Reg, + const APInt &Value) { + assert(Value.getBitWidth() <= 64 && "Value must fit in the register."); + std::vector<MCInst> loadSegmentRegisterCode; + // Preserve the syscall registers here as we don't + // want to make any assumptions about the ordering of what registers are + // loaded in first, and we might have already loaded in registers that we are + // going to be clobbering here. + saveSyscallRegisters(loadSegmentRegisterCode, 2); + // Generate the instructions to make the arch_prctl system call to set + // the registers. + int SyscallCode = 0; + if (Reg == X86::FS) + SyscallCode = ARCH_SET_FS; + else if (Reg == X86::GS) + SyscallCode = ARCH_SET_GS; + else + llvm_unreachable("Only the segment registers GS and FS are supported"); + loadSegmentRegisterCode.push_back( + loadImmediate(X86::RDI, 64, APInt(64, SyscallCode))); + loadSegmentRegisterCode.push_back(loadImmediate(X86::RSI, 64, Value)); + generateSyscall(SYS_arch_prctl, loadSegmentRegisterCode); + // Restore the registers in reverse order + restoreSyscallRegisters(loadSegmentRegisterCode, 2); + return loadSegmentRegisterCode; +} + std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI, unsigned Reg, const APInt &Value) const { + if (X86::SEGMENT_REGRegClass.contains(Reg)) + return loadImmediateSegmentRegister(Reg, Value); if (X86::GR8RegClass.contains(Reg)) return {loadImmediate(Reg, 8, Value)}; if (X86::GR16RegClass.contains(Reg)) @@ -992,12 +1069,6 @@ static constexpr const intptr_t VAddressSpaceCeiling = 0xC0000000; static constexpr const intptr_t VAddressSpaceCeiling = 0x0000800000000000; #endif -void generateSyscall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) { - GeneratedCode.push_back( - loadImmediate(X86::RAX, 64, APInt(64, SyscallNumber))); - GeneratedCode.push_back(MCInstBuilder(X86::SYSCALL)); -} - void generateRoundToNearestPage(unsigned int Register, std::vector<MCInst> &GeneratedCode) { int PageSizeShift = static_cast<int>(round(log2(getpagesize()))); @@ -1157,29 +1228,11 @@ intptr_t ExegesisX86Target::getAuxiliaryMemoryStartAddress() const { return VAddressSpaceCeiling - 2 * getpagesize(); } -void generateRegisterStackPush(unsigned int Register, - std::vector<MCInst> &GeneratedCode) { - GeneratedCode.push_back(MCInstBuilder(X86::PUSH64r).addReg(Register)); -} - -void generateRegisterStackPop(unsigned int Register, - std::vector<MCInst> &GeneratedCode) { - GeneratedCode.push_back(MCInstBuilder(X86::POP64r).addReg(Register)); -} - std::vector<MCInst> ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const { std::vector<MCInst> ConfigurePerfCounterCode; - if(SaveRegisters) { - // Preserve RAX, RDI, and RSI by pushing them to the stack. 
- generateRegisterStackPush(X86::RAX, ConfigurePerfCounterCode); - generateRegisterStackPush(X86::RDI, ConfigurePerfCounterCode); - generateRegisterStackPush(X86::RSI, ConfigurePerfCounterCode); - // RCX and R11 will get clobbered by the syscall instruction, so save them - // as well. - generateRegisterStackPush(X86::RCX, ConfigurePerfCounterCode); - generateRegisterStackPush(X86::R11, ConfigurePerfCounterCode); - } + if (SaveRegisters) + saveSyscallRegisters(ConfigurePerfCounterCode, 2); ConfigurePerfCounterCode.push_back( loadImmediate(X86::RDI, 64, APInt(64, getAuxiliaryMemoryStartAddress()))); ConfigurePerfCounterCode.push_back(MCInstBuilder(X86::MOV32rm) @@ -1192,15 +1245,8 @@ ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const ConfigurePerfCounterCode.push_back( loadImmediate(X86::RSI, 64, APInt(64, Request))); generateSyscall(SYS_ioctl, ConfigurePerfCounterCode); - if(SaveRegisters) { - // Restore R11 then RCX - generateRegisterStackPop(X86::R11, ConfigurePerfCounterCode); - generateRegisterStackPop(X86::RCX, ConfigurePerfCounterCode); - // Restore RAX, RDI, and RSI, in reverse order. - generateRegisterStackPop(X86::RSI, ConfigurePerfCounterCode); - generateRegisterStackPop(X86::RDI, ConfigurePerfCounterCode); - generateRegisterStackPop(X86::RAX, ConfigurePerfCounterCode); - } + if (SaveRegisters) + restoreSyscallRegisters(ConfigurePerfCounterCode, 2); return ConfigurePerfCounterCode; } From 8b485070844d03cda467e75aa8c924184ba671cf Mon Sep 17 00:00:00 2001 From: Aiden Grossman <agrossman154@yahoo.com> Date: Tue, 26 Dec 2023 10:27:19 -0800 Subject: [PATCH 292/342] Revert "[llvm-exegesis] Add support for loading X86 segment registers (#76368)" This reverts commit 7c383481a8e86918b3aaca4288c1eed62a4d6ff4. The reverted patch was failing on quite a few buildbots due to systems not having asm/prctl.h. More investigation is needed on why exactly those systems do not have that specific header. --- .../latency/segment-registers-subprocess.asm | 29 ----- llvm/tools/llvm-exegesis/lib/X86/Target.cpp | 116 ++++++------------ 2 files changed, 35 insertions(+), 110 deletions(-) delete mode 100644 llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm b/llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm deleted file mode 100644 index 5d5219f9375f2..0000000000000 --- a/llvm/test/tools/llvm-exegesis/X86/latency/segment-registers-subprocess.asm +++ /dev/null @@ -1,29 +0,0 @@ -# REQUIRES: exegesis-can-measure-latency, x86_64-linux - -# Check that the value of the segment registers is set properly when in -# subprocess mode. 
- -# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -snippets-file=%s -execution-mode=subprocess | FileCheck %s - -# LLVM-EXEGESIS-DEFREG FS 12345600 -# LLVM-EXEGESIS-DEFREG GS 2468ac00 -# LLVM-EXEGESIS-DEFREG R13 0 -# LLVM-EXEGESIS-DEFREG R14 127 -# LLVM-EXEGESIS-DEFREG R15 0 -# LLVM-EXEGESIS-MEM-DEF MEM1 4096 0000000012345600 -# LLVM-EXEGESIS-MEM-DEF MEM2 4096 000000002468ac00 -# LLVM-EXEGESIS-MEM-MAP MEM1 305418240 -# LLVM-EXEGESIS-MEM-MAP MEM2 610836480 - -movq %fs:0, %r13 -cmpq $0x12345600, %r13 -cmovneq %r14, %r15 -movq %gs:0, %r13 -cmpq $0x2468ac00, %r13 -cmovneq %r14, %r15 - -movq $60, %rax -movq %r15, %rdi -syscall - -# CHECK-NOT: error: 'Child benchmarking process exited with non-zero exit code: Child process returned with unknown exit code' diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index 27eecc357fde3..2c2d1adb0fcf0 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -39,7 +39,6 @@ #endif #ifdef __linux__ -#include <asm/prctl.h> #include <sys/mman.h> #include <sys/syscall.h> #include <unistd.h> @@ -908,85 +907,9 @@ void ExegesisX86Target::decrementLoopCounterAndJump( .addImm(X86::COND_NE); } -void generateRegisterStackPush(unsigned int Register, - std::vector<MCInst> &GeneratedCode) { - GeneratedCode.push_back(MCInstBuilder(X86::PUSH64r).addReg(Register)); -} - -void generateRegisterStackPop(unsigned int Register, - std::vector<MCInst> &GeneratedCode) { - GeneratedCode.push_back(MCInstBuilder(X86::POP64r).addReg(Register)); -} - -void generateSyscall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) { - GeneratedCode.push_back( - loadImmediate(X86::RAX, 64, APInt(64, SyscallNumber))); - GeneratedCode.push_back(MCInstBuilder(X86::SYSCALL)); -} - -constexpr std::array<unsigned, 6> SyscallArgumentRegisters{ - X86::RDI, X86::RSI, X86::RDX, X86::R10, X86::R8, X86::R9}; - -static void saveSyscallRegisters(std::vector<MCInst> &GeneratedCode, - unsigned ArgumentCount) { - assert(ArgumentCount <= 6 && - "System calls only X86-64 Linux can only take six arguments"); - // Preserve RCX and R11 (Clobbered by the system call). - generateRegisterStackPush(X86::RCX, GeneratedCode); - generateRegisterStackPush(X86::R11, GeneratedCode); - // Preserve RAX (used for the syscall number/return value). - generateRegisterStackPush(X86::RAX, GeneratedCode); - // Preserve the registers used to pass arguments to the system call. - for (unsigned I = 0; I < ArgumentCount; ++I) - generateRegisterStackPush(SyscallArgumentRegisters[I], GeneratedCode); -} - -static void restoreSyscallRegisters(std::vector<MCInst> &GeneratedCode, - unsigned ArgumentCount) { - assert(ArgumentCount <= 6 && - "System calls only X86-64 Linux can only take six arguments"); - // Restore the argument registers, in the opposite order of the way they are - // saved. 
- for (unsigned I = ArgumentCount; I > 0; --I) { - generateRegisterStackPop(SyscallArgumentRegisters[I - 1], GeneratedCode); - } - generateRegisterStackPop(X86::RAX, GeneratedCode); - generateRegisterStackPop(X86::R11, GeneratedCode); - generateRegisterStackPop(X86::RCX, GeneratedCode); -} - -static std::vector<MCInst> loadImmediateSegmentRegister(unsigned Reg, - const APInt &Value) { - assert(Value.getBitWidth() <= 64 && "Value must fit in the register."); - std::vector<MCInst> loadSegmentRegisterCode; - // Preserve the syscall registers here as we don't - // want to make any assumptions about the ordering of what registers are - // loaded in first, and we might have already loaded in registers that we are - // going to be clobbering here. - saveSyscallRegisters(loadSegmentRegisterCode, 2); - // Generate the instructions to make the arch_prctl system call to set - // the registers. - int SyscallCode = 0; - if (Reg == X86::FS) - SyscallCode = ARCH_SET_FS; - else if (Reg == X86::GS) - SyscallCode = ARCH_SET_GS; - else - llvm_unreachable("Only the segment registers GS and FS are supported"); - loadSegmentRegisterCode.push_back( - loadImmediate(X86::RDI, 64, APInt(64, SyscallCode))); - loadSegmentRegisterCode.push_back(loadImmediate(X86::RSI, 64, Value)); - generateSyscall(SYS_arch_prctl, loadSegmentRegisterCode); - // Restore the registers in reverse order - restoreSyscallRegisters(loadSegmentRegisterCode, 2); - return loadSegmentRegisterCode; -} - std::vector<MCInst> ExegesisX86Target::setRegTo(const MCSubtargetInfo &STI, unsigned Reg, const APInt &Value) const { - if (X86::SEGMENT_REGRegClass.contains(Reg)) - return loadImmediateSegmentRegister(Reg, Value); if (X86::GR8RegClass.contains(Reg)) return {loadImmediate(Reg, 8, Value)}; if (X86::GR16RegClass.contains(Reg)) @@ -1069,6 +992,12 @@ static constexpr const intptr_t VAddressSpaceCeiling = 0xC0000000; static constexpr const intptr_t VAddressSpaceCeiling = 0x0000800000000000; #endif +void generateSyscall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) { + GeneratedCode.push_back( + loadImmediate(X86::RAX, 64, APInt(64, SyscallNumber))); + GeneratedCode.push_back(MCInstBuilder(X86::SYSCALL)); +} + void generateRoundToNearestPage(unsigned int Register, std::vector<MCInst> &GeneratedCode) { int PageSizeShift = static_cast<int>(round(log2(getpagesize()))); @@ -1228,11 +1157,29 @@ intptr_t ExegesisX86Target::getAuxiliaryMemoryStartAddress() const { return VAddressSpaceCeiling - 2 * getpagesize(); } +void generateRegisterStackPush(unsigned int Register, + std::vector<MCInst> &GeneratedCode) { + GeneratedCode.push_back(MCInstBuilder(X86::PUSH64r).addReg(Register)); +} + +void generateRegisterStackPop(unsigned int Register, + std::vector<MCInst> &GeneratedCode) { + GeneratedCode.push_back(MCInstBuilder(X86::POP64r).addReg(Register)); +} + std::vector<MCInst> ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const { std::vector<MCInst> ConfigurePerfCounterCode; - if (SaveRegisters) - saveSyscallRegisters(ConfigurePerfCounterCode, 2); + if(SaveRegisters) { + // Preserve RAX, RDI, and RSI by pushing them to the stack. + generateRegisterStackPush(X86::RAX, ConfigurePerfCounterCode); + generateRegisterStackPush(X86::RDI, ConfigurePerfCounterCode); + generateRegisterStackPush(X86::RSI, ConfigurePerfCounterCode); + // RCX and R11 will get clobbered by the syscall instruction, so save them + // as well. 
+ generateRegisterStackPush(X86::RCX, ConfigurePerfCounterCode); + generateRegisterStackPush(X86::R11, ConfigurePerfCounterCode); + } ConfigurePerfCounterCode.push_back( loadImmediate(X86::RDI, 64, APInt(64, getAuxiliaryMemoryStartAddress()))); ConfigurePerfCounterCode.push_back(MCInstBuilder(X86::MOV32rm) @@ -1245,8 +1192,15 @@ ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const ConfigurePerfCounterCode.push_back( loadImmediate(X86::RSI, 64, APInt(64, Request))); generateSyscall(SYS_ioctl, ConfigurePerfCounterCode); - if (SaveRegisters) - restoreSyscallRegisters(ConfigurePerfCounterCode, 2); + if(SaveRegisters) { + // Restore R11 then RCX + generateRegisterStackPop(X86::R11, ConfigurePerfCounterCode); + generateRegisterStackPop(X86::RCX, ConfigurePerfCounterCode); + // Restore RAX, RDI, and RSI, in reverse order. + generateRegisterStackPop(X86::RSI, ConfigurePerfCounterCode); + generateRegisterStackPop(X86::RDI, ConfigurePerfCounterCode); + generateRegisterStackPop(X86::RAX, ConfigurePerfCounterCode); + } return ConfigurePerfCounterCode; } From b80e1acc8cfb82158255de24fb2887acd72a4049 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu <min@myhsu.dev> Date: Tue, 26 Dec 2023 11:07:57 -0800 Subject: [PATCH 293/342] [M68k] Improve codegen of overflow arithmetics The codegen logic for overflow arithmetics (e.g. llvm.uadd.overflow) was a mess; overflow multiplications were not even supported. This patch clean up the legalization of overflow arithmetics and add supports for common variants of overflow multiplications. --- llvm/lib/Target/M68k/M68kISelLowering.cpp | 227 ++++++++---------- llvm/lib/Target/M68k/M68kInstrArithmetic.td | 20 +- llvm/lib/Target/M68k/M68kInstrInfo.td | 11 +- .../CodeGen/M68k/Arith/smul-with-overflow.ll | 64 ++--- .../CodeGen/M68k/Arith/umul-with-overflow.ll | 16 +- 5 files changed, 138 insertions(+), 200 deletions(-) diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp index 0830cc7feb220..6ca5962965bde 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.cpp +++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -94,11 +94,10 @@ M68kTargetLowering::M68kTargetLowering(const M68kTargetMachine &TM, setOperationAction(OP, MVT::i16, Expand); } - // FIXME It would be better to use a custom lowering for (auto OP : {ISD::SMULO, ISD::UMULO}) { - setOperationAction(OP, MVT::i8, Expand); - setOperationAction(OP, MVT::i16, Expand); - setOperationAction(OP, MVT::i32, Expand); + setOperationAction(OP, MVT::i8, Custom); + setOperationAction(OP, MVT::i16, Custom); + setOperationAction(OP, MVT::i32, Custom); } for (auto OP : {ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}) @@ -1533,46 +1532,119 @@ bool M68kTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, return VT.bitsLE(MVT::i32) || Subtarget.atLeastM68020(); } -SDValue M68kTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { - // Lower the "add/sub/mul with overflow" instruction into a regular ins plus - // a "setcc" instruction that checks the overflow flag. The "brcond" lowering - // looks for this combo and may remove the "setcc" instruction if the "setcc" - // has only one use. 
+static bool isOverflowArithmetic(unsigned Opcode) { + switch (Opcode) { + case ISD::UADDO: + case ISD::SADDO: + case ISD::USUBO: + case ISD::SSUBO: + case ISD::UMULO: + case ISD::SMULO: + return true; + default: + return false; + } +} + +static void lowerOverflowArithmetic(SDValue Op, SelectionDAG &DAG, + SDValue &Result, SDValue &CCR, + unsigned &CC) { SDNode *N = Op.getNode(); + EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - unsigned BaseOp = 0; - unsigned Cond = 0; SDLoc DL(Op); + + unsigned TruncOp = 0; + auto PromoteMULO = [&](unsigned ExtOp) { + // We don't have 8-bit multiplications, so promote i8 version of U/SMULO + // to i16. + // Ideally this should be done by legalizer but sadly there is no promotion + // rule for U/SMULO at this moment. + if (VT == MVT::i8) { + LHS = DAG.getNode(ExtOp, DL, MVT::i16, LHS); + RHS = DAG.getNode(ExtOp, DL, MVT::i16, RHS); + VT = MVT::i16; + TruncOp = ISD::TRUNCATE; + } + }; + + bool NoOverflow = false; + unsigned BaseOp = 0; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown ovf instruction!"); case ISD::SADDO: BaseOp = M68kISD::ADD; - Cond = M68k::COND_VS; + CC = M68k::COND_VS; break; case ISD::UADDO: BaseOp = M68kISD::ADD; - Cond = M68k::COND_CS; + CC = M68k::COND_CS; break; case ISD::SSUBO: BaseOp = M68kISD::SUB; - Cond = M68k::COND_VS; + CC = M68k::COND_VS; break; case ISD::USUBO: BaseOp = M68kISD::SUB; - Cond = M68k::COND_CS; + CC = M68k::COND_CS; + break; + case ISD::UMULO: + PromoteMULO(ISD::ZERO_EXTEND); + NoOverflow = VT != MVT::i32; + BaseOp = NoOverflow ? ISD::MUL : M68kISD::UMUL; + CC = M68k::COND_VS; + break; + case ISD::SMULO: + PromoteMULO(ISD::SIGN_EXTEND); + NoOverflow = VT != MVT::i32; + BaseOp = NoOverflow ? ISD::MUL : M68kISD::SMUL; + CC = M68k::COND_VS; break; } - // Also sets CCR. - SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i8); + SDVTList VTs; + if (NoOverflow) + VTs = DAG.getVTList(VT); + else + // Also sets CCR. + VTs = DAG.getVTList(VT, MVT::i8); + SDValue Arith = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); - SDValue SetCC = DAG.getNode(M68kISD::SETCC, DL, N->getValueType(1), - DAG.getConstant(Cond, DL, MVT::i8), - SDValue(Arith.getNode(), 1)); + Result = Arith.getValue(0); + if (TruncOp) + // Right now the only place to truncate is from i16 to i8. + Result = DAG.getNode(TruncOp, DL, MVT::i8, Arith); - return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Arith, SetCC); + if (NoOverflow) + CCR = DAG.getConstant(0, DL, N->getValueType(1)); + else + CCR = Arith.getValue(1); +} + +SDValue M68kTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { + SDNode *N = Op.getNode(); + SDLoc DL(Op); + + // Lower the "add/sub/mul with overflow" instruction into a regular ins plus + // a "setcc" instruction that checks the overflow flag. + SDValue Result, CCR; + unsigned CC; + lowerOverflowArithmetic(Op, DAG, Result, CCR, CC); + + SDValue Overflow; + if (isa<ConstantSDNode>(CCR)) { + // It's likely a result of operations that will not overflow + // hence no setcc is needed. + Overflow = DAG.getZExtOrTrunc(CCR, DL, N->getValueType(1)); + } else { + // Generate a M68kISD::SETCC. 
+ Overflow = DAG.getNode(M68kISD::SETCC, DL, N->getValueType(1), + DAG.getConstant(CC, DL, MVT::i8), CCR); + } + + return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Overflow); } /// Create a BTST (Bit Test) node - Test bit \p BitNo in \p Src and set @@ -2269,55 +2341,12 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { Cond = Cmp; addTest = false; } - } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || - CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || - CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - unsigned MxOpcode; - unsigned MxCond; - SDVTList VTs; - switch (CondOpcode) { - case ISD::UADDO: - MxOpcode = M68kISD::ADD; - MxCond = M68k::COND_CS; - break; - case ISD::SADDO: - MxOpcode = M68kISD::ADD; - MxCond = M68k::COND_VS; - break; - case ISD::USUBO: - MxOpcode = M68kISD::SUB; - MxCond = M68k::COND_CS; - break; - case ISD::SSUBO: - MxOpcode = M68kISD::SUB; - MxCond = M68k::COND_VS; - break; - case ISD::UMULO: - MxOpcode = M68kISD::UMUL; - MxCond = M68k::COND_VS; - break; - case ISD::SMULO: - MxOpcode = M68kISD::SMUL; - MxCond = M68k::COND_VS; - break; - default: - llvm_unreachable("unexpected overflowing operator"); - } - if (CondOpcode == ISD::UMULO) - VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i32); - else - VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); - - SDValue MxOp = DAG.getNode(MxOpcode, DL, VTs, LHS, RHS); - - if (CondOpcode == ISD::UMULO) - Cond = MxOp.getValue(2); - else - Cond = MxOp.getValue(1); - - CC = DAG.getConstant(MxCond, DL, MVT::i8); + } else if (isOverflowArithmetic(CondOpcode)) { + // Result is unused here. + SDValue Result; + unsigned CCode; + lowerOverflowArithmetic(Cond, DAG, Result, Cond, CCode); + CC = DAG.getConstant(CCode, DL, MVT::i8); addTest = false; } @@ -2466,61 +2495,15 @@ SDValue M68kTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { } } CondOpcode = Cond.getOpcode(); - if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || - CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO) { - SDValue LHS = Cond.getOperand(0); - SDValue RHS = Cond.getOperand(1); - unsigned MxOpcode; - unsigned MxCond; - SDVTList VTs; - // Keep this in sync with LowerXALUO, otherwise we might create redundant - // instructions that can't be removed afterwards (i.e. M68kISD::ADD and - // M68kISD::INC). 
- switch (CondOpcode) { - case ISD::UADDO: - MxOpcode = M68kISD::ADD; - MxCond = M68k::COND_CS; - break; - case ISD::SADDO: - MxOpcode = M68kISD::ADD; - MxCond = M68k::COND_VS; - break; - case ISD::USUBO: - MxOpcode = M68kISD::SUB; - MxCond = M68k::COND_CS; - break; - case ISD::SSUBO: - MxOpcode = M68kISD::SUB; - MxCond = M68k::COND_VS; - break; - case ISD::UMULO: - MxOpcode = M68kISD::UMUL; - MxCond = M68k::COND_VS; - break; - case ISD::SMULO: - MxOpcode = M68kISD::SMUL; - MxCond = M68k::COND_VS; - break; - default: - llvm_unreachable("unexpected overflowing operator"); - } + if (isOverflowArithmetic(CondOpcode)) { + SDValue Result; + unsigned CCode; + lowerOverflowArithmetic(Cond, DAG, Result, Cond, CCode); if (Inverted) - MxCond = M68k::GetOppositeBranchCondition((M68k::CondCode)MxCond); + CCode = M68k::GetOppositeBranchCondition((M68k::CondCode)CCode); + CC = DAG.getConstant(CCode, DL, MVT::i8); - if (CondOpcode == ISD::UMULO) - VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(), MVT::i8); - else - VTs = DAG.getVTList(LHS.getValueType(), MVT::i8); - - SDValue MxOp = DAG.getNode(MxOpcode, DL, VTs, LHS, RHS); - - if (CondOpcode == ISD::UMULO) - Cond = MxOp.getValue(2); - else - Cond = MxOp.getValue(1); - - CC = DAG.getConstant(MxCond, DL, MVT::i8); AddTest = false; } else { unsigned CondOpc; diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td index 15d2049f62cb7..7f250f4e56ef2 100644 --- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td +++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td @@ -590,8 +590,9 @@ class MxDiMuOp_DD<string MN, bits<4> CMD, bit SIGNED = false, } // $dreg <- $dreg op $dreg -class MxDiMuOp_DD_Long<string MN, bits<10> CMD, bit SIGNED = false> - : MxInst<(outs MxDRD32:$dst), (ins MxDRD32:$src, MxDRD32:$opd), MN#"\t$opd, $dst", []> { +class MxDiMuOp_DD_Long<string MN, SDNode NODE, bits<10> CMD, bit SIGNED = false> + : MxInst<(outs MxDRD32:$dst), (ins MxDRD32:$src, MxDRD32:$opd), MN#"\t$opd, $dst", + [(set i32:$dst, CCR, (NODE i32:$src, i32:$opd))]> { let Inst = (ascend (descend CMD, /*MODE*/0b000, /*REGISTER*/(operand "$opd", 3)), @@ -634,8 +635,8 @@ multiclass MxDiMuOp<string MN, bits<4> CMD, bit isComm = 0> { defm DIV : MxDiMuOp<"div", 0x8>; -def SDIVd32d32 : MxDiMuOp_DD_Long<"divs.l", 0x131, /*SIGNED*/true>; -def UDIVd32d32 : MxDiMuOp_DD_Long<"divu.l", 0x131, /*SIGNED*/false>; +def SDIVd32d32 : MxDiMuOp_DD_Long<"divs.l", sdiv, 0x131, /*SIGNED*/true>; +def UDIVd32d32 : MxDiMuOp_DD_Long<"divu.l", udiv, 0x131, /*SIGNED*/false>; // This is used to cast immediates to 16-bits for operations which don't // support smaller immediate sizes. 
@@ -685,13 +686,6 @@ def : Pat<(urem i16:$dst, i16:$opd), (LSR32di (LSR32di (UDIVd32d16 (MOVZXd32d16 $dst), $opd), 8), 8), MxSubRegIndex16Lo)>; - -// RR i32 -def : Pat<(sdiv i32:$dst, i32:$opd), (SDIVd32d32 $dst, $opd)>; - -def : Pat<(udiv i32:$dst, i32:$opd), (UDIVd32d32 $dst, $opd)>; - - // RI i8 def : Pat<(sdiv i8:$dst, MximmSExt8:$opd), (EXTRACT_SUBREG @@ -737,8 +731,8 @@ def : Pat<(urem i16:$dst, MximmSExt16:$opd), defm MUL : MxDiMuOp<"mul", 0xC, 1>; -def SMULd32d32 : MxDiMuOp_DD_Long<"muls.l", 0x130, /*SIGNED*/true>; -def UMULd32d32 : MxDiMuOp_DD_Long<"mulu.l", 0x130, /*SIGNED*/false>; +def SMULd32d32 : MxDiMuOp_DD_Long<"muls.l", MxSMul, 0x130, /*SIGNED*/true>; +def UMULd32d32 : MxDiMuOp_DD_Long<"mulu.l", MxUMul, 0x130, /*SIGNED*/false>; // RR def : Pat<(mul i16:$dst, i16:$opd), diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.td b/llvm/lib/Target/M68k/M68kInstrInfo.td index dc66e103361a4..1e40c3c48990d 100644 --- a/llvm/lib/Target/M68k/M68kInstrInfo.td +++ b/llvm/lib/Target/M68k/M68kInstrInfo.td @@ -55,15 +55,6 @@ def MxSDT_BiArithCCRInOut : SDTypeProfile<2, 3, [ /* CCR */ SDTCisSameAs<1, 4> ]>; -// RES1, RES2, CCR <- op LHS, RHS -def MxSDT_2BiArithCCROut : SDTypeProfile<3, 2, [ - /* RES 1 */ SDTCisInt<0>, - /* RES 2 */ SDTCisSameAs<0, 1>, - /* CCR */ SDTCisVT<1, i8>, - /* LHS */ SDTCisSameAs<0, 2>, - /* RHS */ SDTCisSameAs<0, 3> -]>; - def MxSDT_CmpTest : SDTypeProfile<1, 2, [ /* CCR */ SDTCisVT<0, i8>, /* Ops */ SDTCisSameAs<1, 2> @@ -134,7 +125,7 @@ def MxAddX : SDNode<"M68kISD::ADDX", MxSDT_BiArithCCRInOut>; def MxSubX : SDNode<"M68kISD::SUBX", MxSDT_BiArithCCRInOut>; def MxSMul : SDNode<"M68kISD::SMUL", MxSDT_BiArithCCROut, [SDNPCommutative]>; -def MxUMul : SDNode<"M68kISD::UMUL", MxSDT_2BiArithCCROut, [SDNPCommutative]>; +def MxUMul : SDNode<"M68kISD::UMUL", MxSDT_BiArithCCROut, [SDNPCommutative]>; def MxCmp : SDNode<"M68kISD::CMP", MxSDT_CmpTest>; def MxBtst : SDNode<"M68kISD::BTST", MxSDT_CmpTest>; diff --git a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll index 1d3371cce833a..cd9349181a631 100644 --- a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll +++ b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll @@ -3,7 +3,6 @@ declare i32 @printf(i8*, ...) 
nounwind declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) -declare { i63, i1 } @llvm.smul.with.overflow.i63(i63, i63) @ok = internal constant [4 x i8] c"%d\0A\00" @no = internal constant [4 x i8] c"no\0A\00" @@ -11,37 +10,23 @@ declare { i63, i1 } @llvm.smul.with.overflow.i63(i63, i63) define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind { ; CHECK-LABEL: test1: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: suba.l #28, %sp -; CHECK-NEXT: movem.l %d2-%d3, (20,%sp) ; 12-byte Folded Spill -; CHECK-NEXT: move.l %d1, (12,%sp) -; CHECK-NEXT: move.l #31, %d2 -; CHECK-NEXT: asr.l %d2, %d1 -; CHECK-NEXT: move.l %d1, (8,%sp) -; CHECK-NEXT: move.l %d0, (4,%sp) -; CHECK-NEXT: asr.l %d2, %d0 -; CHECK-NEXT: move.l %d0, (%sp) -; CHECK-NEXT: jsr __muldi3@PLT -; CHECK-NEXT: move.l %d1, %d3 -; CHECK-NEXT: asr.l %d2, %d3 -; CHECK-NEXT: sub.l %d3, %d0 -; CHECK-NEXT: sne %d0 -; CHECK-NEXT: cmpi.b #0, %d0 -; CHECK-NEXT: beq .LBB0_1 +; CHECK-NEXT: suba.l #12, %sp +; CHECK-NEXT: muls.l %d1, %d0 +; CHECK-NEXT: bvc .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %overflow ; CHECK-NEXT: lea (no,%pc), %a0 ; CHECK-NEXT: move.l %a0, (%sp) ; CHECK-NEXT: jsr printf@PLT ; CHECK-NEXT: move.b #0, %d0 -; CHECK-NEXT: bra .LBB0_3 +; CHECK-NEXT: adda.l #12, %sp +; CHECK-NEXT: rts ; CHECK-NEXT: .LBB0_1: ; %normal -; CHECK-NEXT: move.l %d1, (4,%sp) +; CHECK-NEXT: move.l %d0, (4,%sp) ; CHECK-NEXT: lea (ok,%pc), %a0 ; CHECK-NEXT: move.l %a0, (%sp) ; CHECK-NEXT: jsr printf@PLT ; CHECK-NEXT: move.b #1, %d0 -; CHECK-NEXT: .LBB0_3: ; %overflow -; CHECK-NEXT: movem.l (20,%sp), %d2-%d3 ; 12-byte Folded Reload -; CHECK-NEXT: adda.l #28, %sp +; CHECK-NEXT: adda.l #12, %sp ; CHECK-NEXT: rts entry: %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) @@ -61,37 +46,25 @@ overflow: define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind { ; CHECK-LABEL: test2: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: suba.l #28, %sp -; CHECK-NEXT: movem.l %d2-%d3, (20,%sp) ; 12-byte Folded Spill -; CHECK-NEXT: move.l %d1, (12,%sp) -; CHECK-NEXT: move.l #31, %d2 -; CHECK-NEXT: asr.l %d2, %d1 -; CHECK-NEXT: move.l %d1, (8,%sp) -; CHECK-NEXT: move.l %d0, (4,%sp) -; CHECK-NEXT: asr.l %d2, %d0 -; CHECK-NEXT: move.l %d0, (%sp) -; CHECK-NEXT: jsr __muldi3@PLT -; CHECK-NEXT: move.l %d1, %d3 -; CHECK-NEXT: asr.l %d2, %d3 -; CHECK-NEXT: sub.l %d3, %d0 -; CHECK-NEXT: sne %d0 -; CHECK-NEXT: sub.b #1, %d0 -; CHECK-NEXT: bne .LBB1_3 +; CHECK-NEXT: suba.l #12, %sp +; CHECK-NEXT: muls.l %d1, %d0 +; CHECK-NEXT: svs %d1 +; CHECK-NEXT: sub.b #1, %d1 +; CHECK-NEXT: bne .LBB1_2 ; CHECK-NEXT: ; %bb.1: ; %overflow ; CHECK-NEXT: lea (no,%pc), %a0 ; CHECK-NEXT: move.l %a0, (%sp) ; CHECK-NEXT: jsr printf@PLT ; CHECK-NEXT: move.b #0, %d0 -; CHECK-NEXT: bra .LBB1_2 -; CHECK-NEXT: .LBB1_3: ; %normal -; CHECK-NEXT: move.l %d1, (4,%sp) +; CHECK-NEXT: adda.l #12, %sp +; CHECK-NEXT: rts +; CHECK-NEXT: .LBB1_2: ; %normal +; CHECK-NEXT: move.l %d0, (4,%sp) ; CHECK-NEXT: lea (ok,%pc), %a0 ; CHECK-NEXT: move.l %a0, (%sp) ; CHECK-NEXT: jsr printf@PLT ; CHECK-NEXT: move.b #1, %d0 -; CHECK-NEXT: .LBB1_2: ; %overflow -; CHECK-NEXT: movem.l (20,%sp), %d2-%d3 ; 12-byte Folded Reload -; CHECK-NEXT: adda.l #28, %sp +; CHECK-NEXT: adda.l #12, %sp ; CHECK-NEXT: rts entry: %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) @@ -129,7 +102,8 @@ define i32 @test4(i32 %a, i32 %b) nounwind readnone { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: move.l (8,%sp), %d0 ; CHECK-NEXT: add.l (4,%sp), %d0 -; CHECK-NEXT: lsl.l #2, %d0 +; CHECK-NEXT: move.l #4, %d1 +; CHECK-NEXT: muls.l %d1, %d0 ; CHECK-NEXT: rts 
entry: %tmp0 = add i32 %b, %a diff --git a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll index 16dc1036fd284..ef7171dc386fe 100644 --- a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll +++ b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll @@ -6,15 +6,10 @@ declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b) define i1 @a(i32 %x) nounwind { ; CHECK-LABEL: a: ; CHECK: ; %bb.0: -; CHECK-NEXT: suba.l #20, %sp -; CHECK-NEXT: move.l #3, (12,%sp) -; CHECK-NEXT: move.l #0, (8,%sp) -; CHECK-NEXT: move.l (24,%sp), (4,%sp) -; CHECK-NEXT: move.l #0, (%sp) -; CHECK-NEXT: jsr __muldi3@PLT -; CHECK-NEXT: cmpi.l #0, %d0 -; CHECK-NEXT: sne %d0 -; CHECK-NEXT: adda.l #20, %sp +; CHECK-NEXT: move.l #3, %d0 +; CHECK-NEXT: move.l (4,%sp), %d1 +; CHECK-NEXT: mulu.l %d0, %d1 +; CHECK-NEXT: svs %d0 ; CHECK-NEXT: rts %res = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 3) %obil = extractvalue {i32, i1} %res, 1 @@ -42,7 +37,8 @@ define i32 @test3(i32 %a, i32 %b) nounwind readnone { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: move.l (8,%sp), %d0 ; CHECK-NEXT: add.l (4,%sp), %d0 -; CHECK-NEXT: lsl.l #2, %d0 +; CHECK-NEXT: move.l #4, %d1 +; CHECK-NEXT: mulu.l %d1, %d0 ; CHECK-NEXT: rts entry: %tmp0 = add i32 %b, %a From 4358e6e0c5b1f08de60b6b2fb015a06ab6760ee7 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng <dtcxzyw2333@gmail.com> Date: Wed, 27 Dec 2023 03:16:43 +0800 Subject: [PATCH 294/342] [FuncAttrs] Infer `norecurse` for funcs with calls to `nocallback` callees (#76372) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds missing `norecurse` attrs to funcs that only call intrinsics with `nocallback` attrs. Fixes the regression found in https://github.com/dtcxzyw/llvm-opt-benchmark/pull/45#discussion_r1436148743. The function loses `norecurse` attr because it calls `@llvm.fabs.f64`, which is not marked as `norecurse`. Since `norecurse` is not a default attribute of intrinsics and it is ambiguous for intrinsics, I decided to use the existing `callback` attributes. > nocallback This attribute indicates that the function is only allowed to jump back into caller’s module by a return or an exception, and is not allowed to jump back by invoking a callback function, a direct, possibly transitive, external function call, use of longjmp, or other means. It is a compiler hint that is used at module level to improve dataflow analysis, dropped during linking, and has no effect on functions defined in the current module. See also https://llvm.org/docs/LangRef.html#function-attributes. 
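Concretely, the effect can be seen on a function whose only call is to an
intrinsic declared `nocallback` (a reduced sketch; the function name and body
below are illustrative, not taken from the linked benchmark):

    declare double @llvm.fabs.f64(double) #0

    define double @abs_wrapper(double %x) {
    entry:
      %r = call double @llvm.fabs.f64(double %x)
      ret double %r
    }

    ; The intrinsic declaration carries nocallback among its attributes.
    attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }

Running `opt -passes=function-attrs` on this now adds `norecurse` to
@abs_wrapper even though @llvm.fabs.f64 itself is not marked `norecurse`,
because a `nocallback` declaration cannot call back into the module and thus
cannot re-enter the caller.
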
--- .../RISCV/rvv-intrinsics-handcrafted/vlenb.c | 24 +++++++++---------- llvm/lib/Transforms/IPO/FunctionAttrs.cpp | 5 +++- .../TypeBasedAliasAnalysis/functionattrs.ll | 12 ++++------ .../Transforms/FunctionAttrs/argmemonly.ll | 6 ++--- .../Transforms/FunctionAttrs/convergent.ll | 2 +- .../FunctionAttrs/int_sideeffect.ll | 4 ++-- .../FunctionAttrs/make-buffer-rsrc.ll | 2 +- .../Transforms/FunctionAttrs/nocapture.ll | 16 ++++++------- .../FunctionAttrs/nofree-attributor.ll | 4 ++-- .../Transforms/FunctionAttrs/norecurse.ll | 17 ++++++------- llvm/test/Transforms/FunctionAttrs/nosync.ll | 6 ++--- .../Transforms/FunctionAttrs/readattrs.ll | 4 ++-- .../Transforms/FunctionAttrs/writeonly.ll | 18 +++++++------- 13 files changed, 61 insertions(+), 59 deletions(-) diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vlenb.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vlenb.c index 9d95acc33dddc..582d5fd812bc3 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vlenb.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/vlenb.c @@ -21,19 +21,19 @@ unsigned long test_vlenb(void) { return __riscv_vlenb(); } //. -// RV32: attributes #0 = { mustprogress nofree noinline nosync nounwind willreturn memory(read) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+32bit,+d,+f,+v,+zicsr,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" } -// RV32: attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) } +// RV32: attributes #[[ATTR0:[0-9]+]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+32bit,+d,+f,+v,+zicsr,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" } +// RV32: attributes #[[ATTR1:[0-9]+]] = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) } //. -// RV64: attributes #0 = { mustprogress nofree noinline nosync nounwind willreturn memory(read) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+d,+f,+v,+zicsr,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" } -// RV64: attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) } +// RV64: attributes #[[ATTR0:[0-9]+]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+64bit,+d,+f,+v,+zicsr,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b" } +// RV64: attributes #[[ATTR1:[0-9]+]] = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) } //. -// RV32: !0 = !{i32 1, !"wchar_size", i32 4} -// RV32: !1 = !{i32 1, !"target-abi", !"ilp32d"} -// RV32: !2 = !{i32 8, !"SmallDataLimit", i32 0} -// RV32: !3 = !{!"vlenb"} +// RV32: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// RV32: [[META1:![0-9]+]] = !{i32 1, !"target-abi", !"ilp32d"} +// RV32: [[META2:![0-9]+]] = !{i32 8, !"SmallDataLimit", i32 0} +// RV32: [[META3]] = !{!"vlenb"} //. 
-// RV64: !0 = !{i32 1, !"wchar_size", i32 4} -// RV64: !1 = !{i32 1, !"target-abi", !"lp64d"} -// RV64: !2 = !{i32 8, !"SmallDataLimit", i32 0} -// RV64: !3 = !{!"vlenb"} +// RV64: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// RV64: [[META1:![0-9]+]] = !{i32 1, !"target-abi", !"lp64d"} +// RV64: [[META2:![0-9]+]] = !{i32 8, !"SmallDataLimit", i32 0} +// RV64: [[META3]] = !{!"vlenb"} //. diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 7c277518b21db..9ce9f8451a95f 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1629,7 +1629,10 @@ static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, for (auto &I : BB.instructionsWithoutDebug()) if (auto *CB = dyn_cast<CallBase>(&I)) { Function *Callee = CB->getCalledFunction(); - if (!Callee || Callee == F || !Callee->doesNotRecurse()) + if (!Callee || Callee == F || + (!Callee->doesNotRecurse() && + !(Callee->isDeclaration() && + Callee->hasFnAttribute(Attribute::NoCallback)))) // Function calls a potentially recursive function. return; } diff --git a/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll b/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll index 86e7f8c113d1d..bea56a72bdeae 100644 --- a/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll +++ b/llvm/test/Analysis/TypeBasedAliasAnalysis/functionattrs.ll @@ -43,13 +43,13 @@ define void @test1_no(ptr %p) nounwind { ; This is unusual, since the function is memcpy, but as above, this ; isn't necessarily invalid. -; CHECK: define void @test2_yes(ptr nocapture %p, ptr nocapture %q, i64 %n) #4 { +; CHECK: define void @test2_yes(ptr nocapture %p, ptr nocapture %q, i64 %n) #0 { define void @test2_yes(ptr %p, ptr %q, i64 %n) nounwind { call void @llvm.memcpy.p0.p0.i64(ptr %p, ptr %q, i64 %n, i1 false), !tbaa !1 ret void } -; CHECK: define void @test2_no(ptr nocapture writeonly %p, ptr nocapture readonly %q, i64 %n) #5 { +; CHECK: define void @test2_no(ptr nocapture writeonly %p, ptr nocapture readonly %q, i64 %n) #4 { define void @test2_no(ptr %p, ptr %q, i64 %n) nounwind { call void @llvm.memcpy.p0.p0.i64(ptr %p, ptr %q, i64 %n, i1 false), !tbaa !2 ret void @@ -63,7 +63,7 @@ define i32 @test3_yes(ptr %p) nounwind { ret i32 %t } -; CHECK: define i32 @test3_no(ptr nocapture %p) #6 { +; CHECK: define i32 @test3_no(ptr nocapture %p) #4 { define i32 @test3_no(ptr %p) nounwind { %t = va_arg ptr %p, i32, !tbaa !2 ret i32 %t @@ -76,10 +76,8 @@ declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) nounwind ; CHECK: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) } ; CHECK: attributes #2 = { nofree nosync nounwind memory(none) } ; CHECK: attributes #3 = { nounwind } -; CHECK: attributes #4 = { mustprogress nofree nosync nounwind willreturn memory(none) } -; CHECK: attributes #5 = { mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) } -; CHECK: attributes #6 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) } -; CHECK: attributes #7 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +; CHECK: attributes #4 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) } +; CHECK: attributes #5 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; Root note. 
!0 = !{ } diff --git a/llvm/test/Transforms/FunctionAttrs/argmemonly.ll b/llvm/test/Transforms/FunctionAttrs/argmemonly.ll index 7a968e4119b83..ea6392714bf6f 100644 --- a/llvm/test/Transforms/FunctionAttrs/argmemonly.ll +++ b/llvm/test/Transforms/FunctionAttrs/argmemonly.ll @@ -219,7 +219,7 @@ entry: } define void @test_memcpy_argonly(ptr %dst, ptr %src) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) ; FNATTRS-LABEL: define void @test_memcpy_argonly ; FNATTRS-SAME: (ptr nocapture writeonly [[DST:%.*]], ptr nocapture readonly [[SRC:%.*]]) #[[ATTR9:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -243,7 +243,7 @@ declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) @arr = global [32 x i8] zeroinitializer define void @test_memcpy_src_global(ptr %dst) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) ; FNATTRS-LABEL: define void @test_memcpy_src_global ; FNATTRS-SAME: (ptr nocapture writeonly [[DST:%.*]]) #[[ATTR11:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -263,7 +263,7 @@ entry: } define void @test_memcpy_dst_global(ptr %src) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, inaccessiblemem: none) ; FNATTRS-LABEL: define void @test_memcpy_dst_global ; FNATTRS-SAME: (ptr nocapture readonly [[SRC:%.*]]) #[[ATTR11]] { ; FNATTRS-NEXT: entry: diff --git a/llvm/test/Transforms/FunctionAttrs/convergent.ll b/llvm/test/Transforms/FunctionAttrs/convergent.ll index 0263e0ec22551..a0f4c07e43371 100644 --- a/llvm/test/Transforms/FunctionAttrs/convergent.ll +++ b/llvm/test/Transforms/FunctionAttrs/convergent.ll @@ -74,7 +74,7 @@ declare void @llvm.nvvm.barrier0() convergent define i32 @intrinsic() convergent { ; Implicitly convergent, because the intrinsic is convergent. -; CHECK: Function Attrs: convergent nounwind +; CHECK: Function Attrs: convergent norecurse nounwind ; CHECK-LABEL: define {{[^@]+}}@intrinsic ; CHECK-SAME: () #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: call void @llvm.nvvm.barrier0() diff --git a/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll b/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll index 9ba82e2dc1cce..0f087e1a05f79 100644 --- a/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll +++ b/llvm/test/Transforms/FunctionAttrs/int_sideeffect.ll @@ -7,7 +7,7 @@ declare void @llvm.sideeffect() ; is present. 
define void @test() { -; CHECK: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(inaccessiblemem: readwrite) ; CHECK-LABEL: @test( ; CHECK-NEXT: call void @llvm.sideeffect() ; CHECK-NEXT: ret void @@ -17,7 +17,7 @@ define void @test() { } define void @loop() { -; CHECK: Function Attrs: nofree noreturn nosync nounwind memory(inaccessiblemem: readwrite) +; CHECK: Function Attrs: nofree norecurse noreturn nosync nounwind memory(inaccessiblemem: readwrite) ; CHECK-LABEL: @loop( ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: diff --git a/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll b/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll index 17072bc433fbb..bb9ef9156794e 100644 --- a/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll +++ b/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll @@ -6,7 +6,7 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-ni:7:8" define amdgpu_kernel void @test_make_buffer_rsrc(ptr %p, ptr %q) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) ; FNATTRS-LABEL: define {{[^@]+}}@test_make_buffer_rsrc ; FNATTRS-SAME: (ptr nocapture readonly [[P:%.*]], ptr nocapture writeonly [[Q:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: [[P_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P]], i16 0, i32 4, i32 822243328) diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll index a70d71e62c305..eb999d69d95f1 100644 --- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll +++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll @@ -650,7 +650,7 @@ entry: } define void @nocaptureLaunder(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: write, inaccessiblemem: readwrite) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write, inaccessiblemem: readwrite) ; FNATTRS-LABEL: define void @nocaptureLaunder ; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]]) #[[ATTR13:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -674,7 +674,7 @@ entry: @g2 = global ptr null define void @captureLaunder(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: readwrite) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: readwrite) ; FNATTRS-LABEL: define void @captureLaunder ; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR14:[0-9]+]] { ; FNATTRS-NEXT: [[B:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[P]]) @@ -694,7 +694,7 @@ define void @captureLaunder(ptr %p) { } define void @nocaptureStrip(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: write) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) ; FNATTRS-LABEL: define void @nocaptureStrip ; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]]) #[[ATTR15:[0-9]+]] { ; FNATTRS-NEXT: entry: @@ -718,9 +718,9 @@ entry: @g3 = global ptr null define void 
@captureStrip(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define void @captureStrip -; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR16:[0-9]+]] { +; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR1]] { ; FNATTRS-NEXT: [[B:%.*]] = call ptr @llvm.strip.invariant.group.p0(ptr [[P]]) ; FNATTRS-NEXT: store ptr [[B]], ptr @g3, align 8 ; FNATTRS-NEXT: ret void @@ -831,7 +831,7 @@ define i1 @nocaptureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) define i1 @captureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) null_pointer_is_valid { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; FNATTRS-LABEL: define i1 @captureDereferenceableOrNullICmp -; FNATTRS-SAME: (ptr readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR17:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR16:[0-9]+]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; FNATTRS-NEXT: ret i1 [[TMP1]] ; @@ -886,8 +886,8 @@ define void @recurse_fptr(ptr %f, ptr %p) { define void @readnone_indirec(ptr %f, ptr %p) { ; FNATTRS: Function Attrs: nofree nosync memory(none) ; FNATTRS-LABEL: define void @readnone_indirec -; FNATTRS-SAME: (ptr nocapture readonly [[F:%.*]], ptr readnone [[P:%.*]]) #[[ATTR18:[0-9]+]] { -; FNATTRS-NEXT: call void [[F]](ptr [[P]]) #[[ATTR21:[0-9]+]] +; FNATTRS-SAME: (ptr nocapture readonly [[F:%.*]], ptr readnone [[P:%.*]]) #[[ATTR17:[0-9]+]] { +; FNATTRS-NEXT: call void [[F]](ptr [[P]]) #[[ATTR20:[0-9]+]] ; FNATTRS-NEXT: ret void ; ; ATTRIBUTOR: Function Attrs: nosync memory(none) diff --git a/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll b/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll index 0fe0eadf5f669..ed5534a24cbe8 100644 --- a/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll +++ b/llvm/test/Transforms/FunctionAttrs/nofree-attributor.ll @@ -225,9 +225,9 @@ define void @call_both() #0 { declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { -; FNATTR: Function Attrs: mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable +; FNATTR: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable ; FNATTR-LABEL: define {{[^@]+}}@call_floor -; FNATTR-SAME: (float [[A:%.*]]) #[[ATTR7:[0-9]+]] { +; FNATTR-SAME: (float [[A:%.*]]) #[[ATTR3]] { ; FNATTR-NEXT: [[TMP1:%.*]] = tail call float @llvm.floor.f32(float [[A]]) ; FNATTR-NEXT: ret void ; diff --git a/llvm/test/Transforms/FunctionAttrs/norecurse.ll b/llvm/test/Transforms/FunctionAttrs/norecurse.ll index e1c624dc2ce50..7924428fb4989 100644 --- a/llvm/test/Transforms/FunctionAttrs/norecurse.ll +++ b/llvm/test/Transforms/FunctionAttrs/norecurse.ll @@ -73,7 +73,7 @@ define i32 @extern() { ; ATTRIBUTOR: Function Attrs: nosync memory(none) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@extern ; ATTRIBUTOR-SAME: () #[[ATTR2:[0-9]+]] { -; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() +; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7:[0-9]+]] ; ATTRIBUTOR-NEXT: ret i32 [[A]] ; %a = call i32 @k() @@ -83,7 +83,7 @@ define i32 @extern() { declare i32 @k() readnone define void @intrinsic(ptr %dest, ptr %src, i32 %len) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) 
+; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) ; FNATTRS-LABEL: define {{[^@]+}}@intrinsic ; FNATTRS-SAME: (ptr nocapture writeonly [[DEST:%.*]], ptr nocapture readonly [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR4:[0-9]+]] { ; FNATTRS-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr [[DEST]], ptr [[SRC]], i32 [[LEN]], i1 false) @@ -92,7 +92,7 @@ define void @intrinsic(ptr %dest, ptr %src, i32 %len) { ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@intrinsic ; ATTRIBUTOR-SAME: (ptr nocapture nofree writeonly [[DEST:%.*]], ptr nocapture nofree readonly [[SRC:%.*]], i32 [[LEN:%.*]]) #[[ATTR4:[0-9]+]] { -; ATTRIBUTOR-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr nocapture writeonly [[DEST]], ptr nocapture readonly [[SRC]], i32 [[LEN]], i1 false) #[[ATTR7:[0-9]+]] +; ATTRIBUTOR-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr nocapture writeonly [[DEST]], ptr nocapture readonly [[SRC]], i32 [[LEN]], i1 false) #[[ATTR8:[0-9]+]] ; ATTRIBUTOR-NEXT: ret void ; call void @llvm.memcpy.p0.p0.i32(ptr %dest, ptr %src, i32 %len, i1 false) @@ -111,7 +111,7 @@ define internal i32 @called_by_norecurse() { ; ATTRIBUTOR: Function Attrs: nosync memory(none) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@called_by_norecurse ; ATTRIBUTOR-SAME: () #[[ATTR2]] { -; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() +; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7]] ; ATTRIBUTOR-NEXT: ret i32 [[A]] ; %a = call i32 @k() @@ -145,7 +145,7 @@ define internal i32 @called_by_norecurse_indirectly() { ; ATTRIBUTOR: Function Attrs: nosync memory(none) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@called_by_norecurse_indirectly ; ATTRIBUTOR-SAME: () #[[ATTR2]] { -; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() +; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7]] ; ATTRIBUTOR-NEXT: ret i32 [[A]] ; %a = call i32 @k() @@ -196,7 +196,7 @@ define internal i32 @escapes_as_parameter(ptr %p) { ; ATTRIBUTOR: Function Attrs: nosync memory(none) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@escapes_as_parameter ; ATTRIBUTOR-SAME: (ptr nocapture nofree readnone [[P:%.*]]) #[[ATTR2]] { -; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() +; ATTRIBUTOR-NEXT: [[A:%.*]] = call i32 @k() #[[ATTR7]] ; ATTRIBUTOR-NEXT: ret i32 [[A]] ; %a = call i32 @k() @@ -241,7 +241,7 @@ define void @r() norecurse { ; FNATTRS: attributes #[[ATTR1]] = { nofree nosync nounwind memory(none) } ; FNATTRS: attributes #[[ATTR2]] = { nofree nosync memory(none) } ; FNATTRS: attributes #[[ATTR3:[0-9]+]] = { memory(none) } -; FNATTRS: attributes #[[ATTR4]] = { mustprogress nofree nosync nounwind willreturn memory(argmem: readwrite) } +; FNATTRS: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) } ; FNATTRS: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; FNATTRS: attributes #[[ATTR6]] = { nofree norecurse nosync memory(none) } ;. @@ -252,5 +252,6 @@ define void @r() norecurse { ; ATTRIBUTOR: attributes #[[ATTR4]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) } ; ATTRIBUTOR: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; ATTRIBUTOR: attributes #[[ATTR6]] = { norecurse nosync memory(none) } -; ATTRIBUTOR: attributes #[[ATTR7]] = { nofree willreturn } +; ATTRIBUTOR: attributes #[[ATTR7]] = { nosync } +; ATTRIBUTOR: attributes #[[ATTR8]] = { nofree willreturn } ;. 
diff --git a/llvm/test/Transforms/FunctionAttrs/nosync.ll b/llvm/test/Transforms/FunctionAttrs/nosync.ll index 5950f9e626c41..de5398f17ce51 100644 --- a/llvm/test/Transforms/FunctionAttrs/nosync.ll +++ b/llvm/test/Transforms/FunctionAttrs/nosync.ll @@ -236,7 +236,7 @@ declare void @llvm.memset(ptr %dest, i8 %val, i32 %len, i1 %isvolatile) ; negative, checking volatile intrinsics. define i32 @memcpy_volatile(ptr %ptr1, ptr %ptr2) { -; CHECK: Function Attrs: mustprogress nofree nounwind willreturn memory(argmem: readwrite) +; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) ; CHECK-LABEL: @memcpy_volatile( ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], i32 8, i1 true) ; CHECK-NEXT: ret i32 4 @@ -247,7 +247,7 @@ define i32 @memcpy_volatile(ptr %ptr1, ptr %ptr2) { ; positive, non-volatile intrinsic. define i32 @memset_non_volatile(ptr %ptr1, i8 %val) { -; CHECK: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: write) +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) ; CHECK-LABEL: @memset_non_volatile( ; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr [[PTR1:%.*]], i8 [[VAL:%.*]], i32 8, i1 false) ; CHECK-NEXT: ret i32 4 @@ -298,7 +298,7 @@ define void @i_totally_sync() { declare float @llvm.cos(float %val) readnone define float @cos_test(float %x) { -; CHECK: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none) +; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; CHECK-LABEL: @cos_test( ; CHECK-NEXT: [[C:%.*]] = call float @llvm.cos.f32(float [[X:%.*]]) ; CHECK-NEXT: ret float [[C]] diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll index 0986f74c181d9..39513976f90d7 100644 --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -251,7 +251,7 @@ entry: declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>%val, <4 x ptr>, i32, <4 x i1>) define void @test9(<4 x ptr> %ptrs, <4 x i32>%val) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(write) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write) ; FNATTRS-LABEL: define {{[^@]+}}@test9 ; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]], <4 x i32> [[VAL:%.*]]) #[[ATTR7:[0-9]+]] { ; FNATTRS-NEXT: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> [[VAL]], <4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>) @@ -275,7 +275,7 @@ define void @test9(<4 x ptr> %ptrs, <4 x i32>%val) { declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>) define <4 x i32> @test10(<4 x ptr> %ptrs) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(read) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(read) ; FNATTRS-LABEL: define {{[^@]+}}@test10 ; FNATTRS-SAME: (<4 x ptr> [[PTRS:%.*]]) #[[ATTR9:[0-9]+]] { ; FNATTRS-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[PTRS]], i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef) diff --git a/llvm/test/Transforms/FunctionAttrs/writeonly.ll b/llvm/test/Transforms/FunctionAttrs/writeonly.ll index 5b20300610d81..de2d5e2238947 100644 --- a/llvm/test/Transforms/FunctionAttrs/writeonly.ll +++ b/llvm/test/Transforms/FunctionAttrs/writeonly.ll 
@@ -179,9 +179,9 @@ define void @test_atomicrmw(ptr %p) { } define void @test_ptrmask(ptr %p) { -; FNATTRS: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: write) +; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) ; FNATTRS-LABEL: define {{[^@]+}}@test_ptrmask -; FNATTRS-SAME: (ptr writeonly [[P:%.*]]) #[[ATTR8:[0-9]+]] { +; FNATTRS-SAME: (ptr writeonly [[P:%.*]]) #[[ATTR3]] { ; FNATTRS-NEXT: [[MASK:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[P]], i64 -5) ; FNATTRS-NEXT: store i8 0, ptr [[MASK]], align 1 ; FNATTRS-NEXT: ret void @@ -218,7 +218,7 @@ declare void @direct2_callee(ptr %p) writeonly define void @direct2(ptr %p) { ; FNATTRS: Function Attrs: memory(write) ; FNATTRS-LABEL: define {{[^@]+}}@direct2 -; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR10:[0-9]+]] { +; FNATTRS-SAME: (ptr [[P:%.*]]) #[[ATTR9:[0-9]+]] { ; FNATTRS-NEXT: call void @direct2_callee(ptr [[P]]) ; FNATTRS-NEXT: ret void ; @@ -236,7 +236,7 @@ define void @direct2(ptr %p) { define void @direct2b(ptr %p) { ; FNATTRS: Function Attrs: memory(write) ; FNATTRS-LABEL: define {{[^@]+}}@direct2b -; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]]) #[[ATTR10]] { +; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]]) #[[ATTR9]] { ; FNATTRS-NEXT: call void @direct2_callee(ptr nocapture [[P]]) ; FNATTRS-NEXT: ret void ; @@ -325,8 +325,8 @@ define void @fptr_test2(ptr %p, ptr %f) { define void @fptr_test3(ptr %p, ptr %f) { ; FNATTRS: Function Attrs: memory(write) ; FNATTRS-LABEL: define {{[^@]+}}@fptr_test3 -; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]], ptr nocapture readonly [[F:%.*]]) #[[ATTR10]] { -; FNATTRS-NEXT: call void [[F]](ptr nocapture [[P]]) #[[ATTR10]] +; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]], ptr nocapture readonly [[F:%.*]]) #[[ATTR9]] { +; FNATTRS-NEXT: call void [[F]](ptr nocapture [[P]]) #[[ATTR9]] ; FNATTRS-NEXT: ret void ; ; ATTRIBUTOR: Function Attrs: memory(write) @@ -342,7 +342,7 @@ define void @fptr_test3(ptr %p, ptr %f) { define void @test_argmem_none_callee(ptr %p) { ; FNATTRS-LABEL: define {{[^@]+}}@test_argmem_none_callee ; FNATTRS-SAME: (ptr nocapture readnone [[P:%.*]]) { -; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR11:[0-9]+]] +; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR10:[0-9]+]] ; FNATTRS-NEXT: ret void ; ; ATTRIBUTOR-LABEL: define {{[^@]+}}@test_argmem_none_callee @@ -357,7 +357,7 @@ define void @test_argmem_none_callee(ptr %p) { define void @test_argmem_read_callee(ptr %p) { ; FNATTRS-LABEL: define {{[^@]+}}@test_argmem_read_callee ; FNATTRS-SAME: (ptr nocapture readonly [[P:%.*]]) { -; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR12:[0-9]+]] +; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR11:[0-9]+]] ; FNATTRS-NEXT: ret void ; ; ATTRIBUTOR-LABEL: define {{[^@]+}}@test_argmem_read_callee @@ -372,7 +372,7 @@ define void @test_argmem_read_callee(ptr %p) { define void @test_argmem_write_callee(ptr %p) { ; FNATTRS-LABEL: define {{[^@]+}}@test_argmem_write_callee ; FNATTRS-SAME: (ptr nocapture writeonly [[P:%.*]]) { -; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR13:[0-9]+]] +; FNATTRS-NEXT: call void @direct1_callee(ptr nocapture [[P]]) #[[ATTR12:[0-9]+]] ; FNATTRS-NEXT: ret void ; ; ATTRIBUTOR-LABEL: define {{[^@]+}}@test_argmem_write_callee From 01bf29b9d04e047096b34acc7e4ad1aff97f1a43 Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Tue, 26 Dec 2023 11:21:06 
-0800 Subject: [PATCH 295/342] [sanitizers] Optimize locking StackDepotBase for fork (#76280) Locking StackDepotBase fully is very expensive, as 2^20 buckets need to be locked. Not locking, and instead only unlocking the buckets that must be unlocked to avoid deadlocks, increases the chance of a data race: a value with the same hash can be inserted into the table twice, and one copy is lost. However, this only costs the forked process a small amount of additional memory. --- .../lib/sanitizer_common/sanitizer_flat_map.h | 4 ++++ .../sanitizer_stackdepotbase.h | 22 +++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_flat_map.h b/compiler-rt/lib/sanitizer_common/sanitizer_flat_map.h index 8bb8304910c73..d246781fe1df5 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_flat_map.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_flat_map.h @@ -109,6 +109,10 @@ class TwoLevelMap { return *AddressSpaceView::LoadWritable(&map2[idx % kSize2]); } + void Lock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { mu_.Lock(); } + + void Unlock() SANITIZER_NO_THREAD_SAFETY_ANALYSIS { mu_.Unlock(); } + private: constexpr uptr MmapSize() const { return RoundUpTo(kSize2 * sizeof(T), GetPageSizeCached()); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h index 21d57d9ab2a91..279bc5de3bb93 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stackdepotbase.h @@ -161,18 +161,32 @@ StackDepotBase<Node, kReservedBits, kTabSizeLog>::Get(u32 id) { template <class Node, int kReservedBits, int kTabSizeLog> void StackDepotBase<Node, kReservedBits, kTabSizeLog>::LockBeforeFork() { - for (int i = 0; i < kTabSize; ++i) { - lock(&tab[i]); - } + // Do not lock the hash table. Doing so is very expensive and not really + // needed. The parent process will neither lock nor unlock; the child process + // risks deadlocking on already-locked buckets, so to avoid deadlock we unlock + // every locked bucket in `UnlockAfterFork`. This may affect consistency of + // the hash table, but the only issue is that a few items inserted by the + // parent will not be found by the child, which may insert them again, + // wasting some space in `stackStore`. + + // We still need to lock nodes. + nodes.Lock(); } template <class Node, int kReservedBits, int kTabSizeLog> void StackDepotBase<Node, kReservedBits, kTabSizeLog>::UnlockAfterFork( bool fork_child) { + nodes.Unlock(); + + // Only unlock in the child process to avoid deadlock. See `LockBeforeFork`. + if (!fork_child) + return; + for (int i = 0; i < kTabSize; ++i) { atomic_uint32_t *p = &tab[i]; uptr s = atomic_load(p, memory_order_relaxed); - unlock(p, s & kUnlockMask); + if (s & kLockMask) + unlock(p, s & kUnlockMask); } } From 1022febd9df30abbd5c490b94290c4422ca15b01 Mon Sep 17 00:00:00 2001 From: Abhinav271828 <71174780+Abhinav271828@users.noreply.github.com> Date: Wed, 27 Dec 2023 00:59:26 +0530 Subject: [PATCH 296/342] [MLIR][Presburger] Generating functions and quasi-polynomials for Barvinok's algorithm (#75702) Define basic types and classes for Barvinok's algorithm, including polyhedra, generating functions and quasi-polynomials. The class definitions include methods for arithmetic manipulation, printing, logical relations, etc.
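As a rough usage sketch (not part of the patch; it only exercises the constructor and operator signatures introduced in the diff below, and the function name is invented), a quasi-polynomial over two parameters x and y such as f(x, y) = 1/2 * ⌊x + 1/3⌋ and g(x, y) = 2 * ⌊y⌋ could be built and combined like this:

```cpp
#include "mlir/Analysis/Presburger/QuasiPolynomial.h"

using namespace mlir::presburger;

void quasiPolynomialExample() {
  // Each affine functional has numVars + 1 coefficients; the last entry is
  // the constant term. f(x, y) = 1/2 * floor(x + 1/3).
  QuasiPolynomial f(/*numVars=*/2, {Fraction(1, 2)},
                    {{{Fraction(1, 1), Fraction(0, 1), Fraction(1, 3)}}});
  // g(x, y) = 2 * floor(y).
  QuasiPolynomial g(/*numVars=*/2, {Fraction(2, 1)},
                    {{{Fraction(0, 1), Fraction(1, 1), Fraction(0, 1)}}});

  QuasiPolynomial sum = f + g;                    // term lists are concatenated
  QuasiPolynomial prod = f * g;                   // pairwise products of terms
  QuasiPolynomial scaled = prod / Fraction(4, 1); // scale coefficients by 1/4
  QuasiPolynomial reduced = scaled.simplify();    // drop identically-zero terms
  (void)sum;
  (void)reduced;
}
```

The same concatenation-of-terms approach is used by `GeneratingFunction::operator+`, which appends the signs, numerators, and denominators of the two operands.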
--- .../Analysis/Presburger/QuasiPolynomial.h | 71 +++++++++ mlir/lib/Analysis/Presburger/CMakeLists.txt | 1 + .../Analysis/Presburger/GeneratingFunction.h | 132 +++++++++++++++++ .../Analysis/Presburger/QuasiPolynomial.cpp | 113 ++++++++++++++ .../Analysis/Presburger/CMakeLists.txt | 1 + .../Presburger/QuasiPolynomialTest.cpp | 140 ++++++++++++++++++ mlir/unittests/Analysis/Presburger/Utils.h | 23 +++ 7 files changed, 481 insertions(+) create mode 100644 mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h create mode 100644 mlir/lib/Analysis/Presburger/GeneratingFunction.h create mode 100644 mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp create mode 100644 mlir/unittests/Analysis/Presburger/QuasiPolynomialTest.cpp diff --git a/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h b/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h new file mode 100644 index 0000000000000..f8ce8524e41b2 --- /dev/null +++ b/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h @@ -0,0 +1,71 @@ +//===- QuasiPolynomial.h - QuasiPolynomial Class ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definition of the QuasiPolynomial class for Barvinok's algorithm, +// which represents a single-valued function on a set of parameters. +// It is an expression of the form +// f(x) = \sum_i c_i * \prod_j ⌊g_{ij}(x)⌋ +// where c_i \in Q and +// g_{ij} : Q^d -> Q are affine functionals over d parameters. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_PRESBURGER_QUASIPOLYNOMIAL_H +#define MLIR_ANALYSIS_PRESBURGER_QUASIPOLYNOMIAL_H + +#include "mlir/Analysis/Presburger/Fraction.h" +#include "mlir/Analysis/Presburger/PresburgerSpace.h" + +namespace mlir { +namespace presburger { + +// A class to describe quasi-polynomials. +// A quasipolynomial consists of a set of terms. +// The ith term is a constant `coefficients[i]`, multiplied +// by the product of a set of affine functions on n parameters. +// Represents functions f : Q^n -> Q of the form +// +// f(x) = \sum_i c_i * \prod_j ⌊g_{ij}(x)⌋ +// +// where c_i \in Q and +// g_{ij} : Q^n -> Q are affine functionals. +class QuasiPolynomial : public PresburgerSpace { +public: + QuasiPolynomial(unsigned numVars, SmallVector<Fraction> coeffs = {}, + std::vector<std::vector<SmallVector<Fraction>>> aff = {}); + + // Find the number of inputs (numDomain) to the polynomial. + // numSymbols is set to zero. + unsigned getNumInputs() const { + return getNumDomainVars() + getNumSymbolVars(); + } + + const SmallVector<Fraction> &getCoefficients() const { return coefficients; } + + const std::vector<std::vector<SmallVector<Fraction>>> &getAffine() const { + return affine; + } + + // Arithmetic operations. + QuasiPolynomial operator+(const QuasiPolynomial &x) const; + QuasiPolynomial operator-(const QuasiPolynomial &x) const; + QuasiPolynomial operator*(const QuasiPolynomial &x) const; + QuasiPolynomial operator/(const Fraction x) const; + + // Removes terms which evaluate to zero from the expression. 
+ QuasiPolynomial simplify(); + +private: + SmallVector<Fraction> coefficients; + std::vector<std::vector<SmallVector<Fraction>>> affine; +}; + +} // namespace presburger +} // namespace mlir + +#endif // MLIR_ANALYSIS_PRESBURGER_QUASIPOLYNOMIAL_H \ No newline at end of file diff --git a/mlir/lib/Analysis/Presburger/CMakeLists.txt b/mlir/lib/Analysis/Presburger/CMakeLists.txt index 22f1a4cac4405..e77e1623dae17 100644 --- a/mlir/lib/Analysis/Presburger/CMakeLists.txt +++ b/mlir/lib/Analysis/Presburger/CMakeLists.txt @@ -6,6 +6,7 @@ add_mlir_library(MLIRPresburger PresburgerRelation.cpp PresburgerSpace.cpp PWMAFunction.cpp + QuasiPolynomial.cpp Simplex.cpp SlowMPInt.cpp Utils.cpp diff --git a/mlir/lib/Analysis/Presburger/GeneratingFunction.h b/mlir/lib/Analysis/Presburger/GeneratingFunction.h new file mode 100644 index 0000000000000..8676b84c1c4df --- /dev/null +++ b/mlir/lib/Analysis/Presburger/GeneratingFunction.h @@ -0,0 +1,132 @@ +//===- GeneratingFunction.h - Generating Functions over Q^d -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definition of the GeneratingFunction class for Barvinok's algorithm, +// which represents a function over Q^n, parameterized by d parameters. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H +#define MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H + +#include "mlir/Analysis/Presburger/Fraction.h" +#include "mlir/Analysis/Presburger/Matrix.h" + +namespace mlir { +namespace presburger { + +// A parametric point is a vector, each of whose elements +// is an affine function of n parameters. Each row +// in the matrix represents the affine function and +// has n+1 elements. +using ParamPoint = FracMatrix; + +// A point is simply a vector. +using Point = SmallVector<Fraction>; + +// A class to describe the type of generating function +// used to enumerate the integer points in a polytope. +// Consists of a set of terms, where the ith term has +// * a sign, ±1, stored in `signs[i]` +// * a numerator, of the form x^{n}, +// where n, stored in `numerators[i]`, +// is a parametric point. +// * a denominator, of the form (1 - x^{d1})...(1 - x^{dn}), +// where each dj, stored in `denominators[i][j]`, +// is a vector. +// +// Represents functions f_p : Q^n -> Q of the form +// +// f_p(x) = \sum_i s_i * (x^n_i(p)) / (\prod_j (1 - x^d_{ij}) +// +// where s_i is ±1, +// n_i \in Q^d -> Q^n is an n-vector of affine functions on d parameters, and +// g_{ij} \in Q^n are vectors. 
+class GeneratingFunction { +public: + GeneratingFunction(unsigned numParam, SmallVector<int, 8> signs, + std::vector<ParamPoint> nums, + std::vector<std::vector<Point>> dens) + : numParam(numParam), signs(signs), numerators(nums), denominators(dens) { + for (const ParamPoint &term : numerators) + assert(term.getNumColumns() == numParam + 1 && + "dimensionality of numerator exponents does not match number of " + "parameters!"); + } + + unsigned getNumParams() const { return numParam; } + + SmallVector<int> getSigns() const { return signs; } + + std::vector<ParamPoint> getNumerators() const { return numerators; } + + std::vector<std::vector<Point>> getDenominators() const { return denominators; } + + GeneratingFunction operator+(const GeneratingFunction &gf) const { + assert(numParam == gf.getNumParams() && + "two generating functions with different numbers of parameters " + "cannot be added!"); + SmallVector<int> sumSigns = signs; + sumSigns.append(gf.signs); + + std::vector<ParamPoint> sumNumerators = numerators; + sumNumerators.insert(sumNumerators.end(), gf.numerators.begin(), + gf.numerators.end()); + + std::vector<std::vector<Point>> sumDenominators = denominators; + sumDenominators.insert(sumDenominators.end(), gf.denominators.begin(), + gf.denominators.end()); + return GeneratingFunction(numParam, sumSigns, sumNumerators, sumDenominators); + } + + llvm::raw_ostream &print(llvm::raw_ostream &os) const { + for (unsigned i = 0, e = signs.size(); i < e; i++) { + if (i == 0) { + if (signs[i] == -1) + os << "- "; + } else { + if (signs[i] == 1) + os << " + "; + else + os << " - "; + } + + os << "x^["; + unsigned r = numerators[i].getNumRows(); + for (unsigned j = 0; j < r - 1; j++) { + os << "["; + for (unsigned k = 0, c = numerators[i].getNumColumns(); k < c - 1; k++) + os << numerators[i].at(j, k) << ","; + os << numerators[i].getRow(j).back() << "],"; + } + os << "["; + for (unsigned k = 0, c = numerators[i].getNumColumns(); k < c - 1; k++) + os << numerators[i].at(r - 1, k) << ","; + os << numerators[i].getRow(r - 1).back() << "]]/"; + + for (const Point &den : denominators[i]) { + os << "(x^["; + for (unsigned j = 0, e = den.size(); j < e - 1; j++) + os << den[j] << ","; + os << den.back() << "])"; + } + } + return os; + } + +private: + unsigned numParam; + SmallVector<int, 8> signs; + std::vector<ParamPoint> numerators; + std::vector<std::vector<Point>> denominators; +}; + +} // namespace presburger +} // namespace mlir + +#endif // MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H \ No newline at end of file diff --git a/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp new file mode 100644 index 0000000000000..902e3ced472f8 --- /dev/null +++ b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp @@ -0,0 +1,113 @@ +//===- QuasiPolynomial.cpp - Quasipolynomial Class --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Presburger/QuasiPolynomial.h" +#include "mlir/Analysis/Presburger/Fraction.h" +#include "mlir/Analysis/Presburger/PresburgerSpace.h" +#include "mlir/Analysis/Presburger/Utils.h" + +using namespace mlir; +using namespace presburger; + +QuasiPolynomial::QuasiPolynomial( + unsigned numVars, SmallVector<Fraction> coeffs, + std::vector<std::vector<SmallVector<Fraction>>> aff) + : PresburgerSpace(/*numDomain=*/numVars, /*numRange=*/1, /*numSymbols=*/0, + /*numLocals=*/0), + coefficients(coeffs), affine(aff) { + // For each term which involves at least one affine function, + for (const std::vector<SmallVector<Fraction>> &term : affine) { + if (term.size() == 0) + continue; + // the number of elements in each affine function is + // one more than the number of symbols. + for (const SmallVector<Fraction> &aff : term) { + assert(aff.size() == getNumInputs() + 1 && + "dimensionality of affine functions does not match number of " + "symbols!"); + } + } +} + +QuasiPolynomial QuasiPolynomial::operator+(const QuasiPolynomial &x) const { + assert(getNumInputs() == x.getNumInputs() && + "two quasi-polynomials with different numbers of symbols cannot " + "be added!"); + SmallVector<Fraction> sumCoeffs = coefficients; + sumCoeffs.append(x.coefficients); + std::vector<std::vector<SmallVector<Fraction>>> sumAff = affine; + sumAff.insert(sumAff.end(), x.affine.begin(), x.affine.end()); + return QuasiPolynomial(getNumInputs(), sumCoeffs, sumAff); +} + +QuasiPolynomial QuasiPolynomial::operator-(const QuasiPolynomial &x) const { + assert(getNumInputs() == x.getNumInputs() && + "two quasi-polynomials with different numbers of symbols cannot " + "be subtracted!"); + QuasiPolynomial qp(getNumInputs(), x.coefficients, x.affine); + for (Fraction &coeff : qp.coefficients) + coeff = -coeff; + return *this + qp; +} + +QuasiPolynomial QuasiPolynomial::operator*(const QuasiPolynomial &x) const { + assert(getNumInputs() == x.getNumInputs() && + "two quasi-polynomials with different numbers of " + "symbols cannot be multiplied!"); + + SmallVector<Fraction> coeffs; + coeffs.reserve(coefficients.size() * x.coefficients.size()); + for (const Fraction &coeff : coefficients) + for (const Fraction &xcoeff : x.coefficients) + coeffs.push_back(coeff * xcoeff); + + std::vector<SmallVector<Fraction>> product; + std::vector<std::vector<SmallVector<Fraction>>> aff; + aff.reserve(affine.size() * x.affine.size()); + for (const std::vector<SmallVector<Fraction>> &term : affine) { + for (const std::vector<SmallVector<Fraction>> &xterm : x.affine) { + product.clear(); + product.insert(product.end(), term.begin(), term.end()); + product.insert(product.end(), xterm.begin(), xterm.end()); + aff.push_back(product); + } + } + + return QuasiPolynomial(getNumInputs(), coeffs, aff); +} + +QuasiPolynomial QuasiPolynomial::operator/(const Fraction x) const { + assert(x != 0 && "division by zero!"); + QuasiPolynomial qp(*this); + for (Fraction &coeff : qp.coefficients) + coeff /= x; + return qp; +} + +// Removes terms which evaluate to zero from the expression. 
+QuasiPolynomial QuasiPolynomial::simplify() { + SmallVector<Fraction> newCoeffs({}); + std::vector<std::vector<SmallVector<Fraction>>> newAffine({}); + for (unsigned i = 0, e = coefficients.size(); i < e; i++) { + // A term is zero if its coefficient is zero, or + if (coefficients[i] == Fraction(0, 1)) + continue; + bool product_is_zero = + // if any of the affine functions in the product + llvm::any_of(affine[i], [](const SmallVector<Fraction> &affine_ij) { + // has all its coefficients as zero. + return llvm::all_of(affine_ij, + [](const Fraction &f) { return f == 0; }); + }); + if (product_is_zero) + continue; + newCoeffs.push_back(coefficients[i]); + newAffine.push_back(affine[i]); + } + return QuasiPolynomial(getNumInputs(), newCoeffs, newAffine); +} \ No newline at end of file diff --git a/mlir/unittests/Analysis/Presburger/CMakeLists.txt b/mlir/unittests/Analysis/Presburger/CMakeLists.txt index b6ce273e35a0e..e37133354e53c 100644 --- a/mlir/unittests/Analysis/Presburger/CMakeLists.txt +++ b/mlir/unittests/Analysis/Presburger/CMakeLists.txt @@ -11,6 +11,7 @@ add_mlir_unittest(MLIRPresburgerTests PresburgerRelationTest.cpp PresburgerSpaceTest.cpp PWMAFunctionTest.cpp + QuasiPolynomialTest.cpp SimplexTest.cpp UtilsTest.cpp ) diff --git a/mlir/unittests/Analysis/Presburger/QuasiPolynomialTest.cpp b/mlir/unittests/Analysis/Presburger/QuasiPolynomialTest.cpp new file mode 100644 index 0000000000000..a84f0234067ab --- /dev/null +++ b/mlir/unittests/Analysis/Presburger/QuasiPolynomialTest.cpp @@ -0,0 +1,140 @@ +//===- MatrixTest.cpp - Tests for QuasiPolynomial -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Presburger/QuasiPolynomial.h" +#include "./Utils.h" +#include "mlir/Analysis/Presburger/Fraction.h" +#include <gmock/gmock.h> +#include <gtest/gtest.h> + +using namespace mlir; +using namespace presburger; + +// Test the arithmetic operations on QuasiPolynomials; +// addition, subtraction, multiplication, and division +// by a constant. +// Two QPs of 3 parameters each were generated randomly +// and their sum, difference, and product computed by hand. 
+TEST(QuasiPolynomialTest, arithmetic) { + QuasiPolynomial qp1( + 3, {Fraction(1, 3), Fraction(1, 1), Fraction(1, 2)}, + {{{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)}, + {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)}}, + {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)}}, + {{Fraction(-3, 2), Fraction(1, 1), Fraction(5, 6), Fraction(7, 5)}, + {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)}, + {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4), Fraction(0, 1)}}}); + QuasiPolynomial qp2( + 3, {Fraction(1, 1), Fraction(2, 1)}, + {{{Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)}, + {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}}, + {{Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1), Fraction(0, 1)}}}); + + QuasiPolynomial sum = qp1 + qp2; + EXPECT_EQ_REPR_QUASIPOLYNOMIAL( + sum, + QuasiPolynomial( + 3, + {Fraction(1, 3), Fraction(1, 1), Fraction(1, 2), Fraction(1, 1), + Fraction(2, 1)}, + {{{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)}, + {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)}}, + {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)}}, + {{Fraction(-3, 2), Fraction(1, 1), Fraction(5, 6), Fraction(7, 5)}, + {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)}, + {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4), Fraction(0, 1)}}, + {{Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)}, + {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}}, + {{Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1), + Fraction(0, 1)}}})); + + QuasiPolynomial diff = qp1 - qp2; + EXPECT_EQ_REPR_QUASIPOLYNOMIAL( + diff, + QuasiPolynomial( + 3, + {Fraction(1, 3), Fraction(1, 1), Fraction(1, 2), Fraction(-1, 1), + Fraction(-2, 1)}, + {{{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)}, + {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)}}, + {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)}}, + {{Fraction(-3, 2), Fraction(1, 1), Fraction(5, 6), Fraction(7, 5)}, + {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)}, + {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4), Fraction(0, 1)}}, + {{Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)}, + {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}}, + {{Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1), + Fraction(0, 1)}}})); + + QuasiPolynomial prod = qp1 * qp2; + EXPECT_EQ_REPR_QUASIPOLYNOMIAL( + prod, + QuasiPolynomial( + 3, + {Fraction(1, 3), Fraction(2, 3), Fraction(1, 1), Fraction(2, 1), + Fraction(1, 2), Fraction(1, 1)}, + {{{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)}, + {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)}, + {Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)}, + {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}}, + {{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)}, + {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)}, + {Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1), Fraction(0, 1)}}, + {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)}, + {Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)}, + {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}}, + {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)}, + {Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1), Fraction(0, 1)}}, + {{Fraction(-3, 2), Fraction(1, 1), 
Fraction(5, 6), Fraction(7, 5)}, + {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)}, + {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4), Fraction(0, 1)}, + {Fraction(1, 2), Fraction(0, 1), Fraction(-1, 3), Fraction(5, 3)}, + {Fraction(2, 1), Fraction(5, 4), Fraction(9, 7), Fraction(-1, 5)}}, + {{Fraction(-3, 2), Fraction(1, 1), Fraction(5, 6), Fraction(7, 5)}, + {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)}, + {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4), Fraction(0, 1)}, + {Fraction(1, 3), Fraction(-2, 3), Fraction(1, 1), + Fraction(0, 1)}}})); + + QuasiPolynomial quot = qp1 / 2; + EXPECT_EQ_REPR_QUASIPOLYNOMIAL( + quot, + QuasiPolynomial( + 3, {Fraction(1, 6), Fraction(1, 2), Fraction(1, 4)}, + {{{Fraction(1, 1), Fraction(-1, 2), Fraction(4, 5), Fraction(0, 1)}, + {Fraction(2, 3), Fraction(3, 4), Fraction(-1, 1), Fraction(5, 7)}}, + {{Fraction(1, 2), Fraction(1, 1), Fraction(4, 5), Fraction(1, 1)}}, + {{Fraction(-3, 2), Fraction(1, 1), Fraction(5, 6), Fraction(7, 5)}, + {Fraction(1, 4), Fraction(2, 1), Fraction(6, 5), Fraction(-9, 8)}, + {Fraction(3, 2), Fraction(2, 5), Fraction(-7, 4), + Fraction(0, 1)}}})); +} + +// Test the simplify() operation on QPs, which removes terms that +// are identically zero. A random QP was generated and terms were +// changed to account for each condition in simplify() – +// the term coefficient being zero, or all the coefficients in some +// affine term in the product being zero. +TEST(QuasiPolynomialTest, simplify) { + QuasiPolynomial qp(2, + {Fraction(2, 3), Fraction(0, 1), Fraction(1, 1), + Fraction(1, 2), Fraction(0, 1)}, + {{{Fraction(1, 1), Fraction(3, 4), Fraction(5, 3)}, + {Fraction(2, 1), Fraction(0, 1), Fraction(0, 1)}}, + {{Fraction(1, 3), Fraction(8, 5), Fraction(2, 5)}}, + {{Fraction(2, 7), Fraction(9, 5), Fraction(0, 1)}, + {Fraction(0, 1), Fraction(0, 1), Fraction(0, 1)}}, + {{Fraction(1, 1), Fraction(4, 5), Fraction(6, 5)}}, + {{Fraction(1, 3), Fraction(4, 3), Fraction(7, 8)}}}); + EXPECT_EQ_REPR_QUASIPOLYNOMIAL( + qp.simplify(), + QuasiPolynomial(2, {Fraction(2, 3), Fraction(1, 2)}, + {{{Fraction(1, 1), Fraction(3, 4), Fraction(5, 3)}, + {Fraction(2, 1), Fraction(0, 1), Fraction(0, 1)}}, + {{Fraction(1, 1), Fraction(4, 5), Fraction(6, 5)}}})); +} \ No newline at end of file diff --git a/mlir/unittests/Analysis/Presburger/Utils.h b/mlir/unittests/Analysis/Presburger/Utils.h index 544577375dd1d..2a9966c7ce2ea 100644 --- a/mlir/unittests/Analysis/Presburger/Utils.h +++ b/mlir/unittests/Analysis/Presburger/Utils.h @@ -17,6 +17,7 @@ #include "mlir/Analysis/Presburger/Matrix.h" #include "mlir/Analysis/Presburger/PWMAFunction.h" #include "mlir/Analysis/Presburger/PresburgerRelation.h" +#include "mlir/Analysis/Presburger/QuasiPolynomial.h" #include "mlir/Analysis/Presburger/Simplex.h" #include "mlir/IR/MLIRContext.h" #include "mlir/Support/LLVM.h" @@ -71,6 +72,28 @@ inline void EXPECT_EQ_FRAC_MATRIX(FracMatrix a, FracMatrix b) { EXPECT_EQ(a(row, col), b(row, col)); } +// Check the coefficients (in order) of two quasipolynomials. +// Note that this is not a true equality check. 
+inline void EXPECT_EQ_REPR_QUASIPOLYNOMIAL(QuasiPolynomial a, QuasiPolynomial b) { + EXPECT_EQ(a.getNumInputs(), b.getNumInputs()); + + SmallVector<Fraction> aCoeffs = a.getCoefficients(), + bCoeffs = b.getCoefficients(); + EXPECT_EQ(aCoeffs.size(), bCoeffs.size()); + for (unsigned i = 0, e = aCoeffs.size(); i < e; i++) + EXPECT_EQ(aCoeffs[i], bCoeffs[i]); + + std::vector<std::vector<SmallVector<Fraction>>> aAff = a.getAffine(), + bAff = b.getAffine(); + EXPECT_EQ(aAff.size(), bAff.size()); + for (unsigned i = 0, e = aAff.size(); i < e; i++) { + EXPECT_EQ(aAff[i].size(), bAff[i].size()); + for (unsigned j = 0, f = aAff[i].size(); j < f; j++) + for (unsigned k = 0, g = a.getNumInputs(); k <= g; k++) + EXPECT_EQ(aAff[i][j][k], bAff[i][j][k]); + } +} + /// lhs and rhs represent non-negative integers or positive infinity. The /// infinity case corresponds to when the Optional is empty. inline bool infinityOrUInt64LE(std::optional<MPInt> lhs, From e8b6fa5f301de4688b7a4bd6c41d30f29f0e2ddd Mon Sep 17 00:00:00 2001 From: DavidKorczynski <david@adalogics.com> Date: Tue, 26 Dec 2023 21:32:13 +0000 Subject: [PATCH 297/342] [WebAssembly] Add bounds check in parseCodeSection (#76407) This is needed as otherwise `Ctx.Ptr` will be incremented to a position outside it's available buffer, which is being used to read values e.g. https://github.com/llvm/llvm-project/blob/966d564e43e650b9c34f9c67829d3947f52add91/llvm/lib/Object/WasmObjectFile.cpp#L1469 Fixes: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28856 Signed-off-by: David Korczynski <david@adalogics.com> --- llvm/lib/Object/WasmObjectFile.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp index dfe86a45df322..40665d686cf93 100644 --- a/llvm/lib/Object/WasmObjectFile.cpp +++ b/llvm/lib/Object/WasmObjectFile.cpp @@ -1484,6 +1484,11 @@ Error WasmObjectFile::parseCodeSection(ReadContext &Ctx) { } uint32_t BodySize = FunctionEnd - Ctx.Ptr; + // Ensure that Function is within Ctx's buffer. + if (Ctx.Ptr + BodySize > Ctx.End) { + return make_error<GenericBinaryError>("Function extends beyond buffer", + object_error::parse_failed); + } Function.Body = ArrayRef<uint8_t>(Ctx.Ptr, BodySize); // This will be set later when reading in the linking metadata section. 
Function.Comdat = UINT32_MAX; From 7962bd5719b7a1acc595bbacdb479addd81703bc Mon Sep 17 00:00:00 2001 From: Vitaly Buka <vitalybuka@google.com> Date: Tue, 26 Dec 2023 13:45:24 -0800 Subject: [PATCH 298/342] [hwasan] Make stack variables output consistent with globals (#76197) --- compiler-rt/lib/hwasan/hwasan_report.cpp | 11 +++++------ compiler-rt/test/hwasan/TestCases/stack-overflow.c | 3 +-- compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c | 3 +-- compiler-rt/test/hwasan/TestCases/stack-uar.c | 3 +-- compiler-rt/test/hwasan/TestCases/stack-uas.c | 3 +-- compiler-rt/test/hwasan/TestCases/stack-underflow.c | 3 +-- compiler-rt/test/hwasan/TestCases/strip_path_prefix.c | 2 +- 7 files changed, 11 insertions(+), 17 deletions(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index 5b3a99adfea7c..1a018a891b56e 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -260,16 +260,15 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa, Printf("Cause: %s\n", cause); Printf("%s", d.Default()); Printf("%s", d.Location()); - Printf("%p is located %zd bytes %s a %zd-byte region [%p,%p)\n", - untagged_addr, offset, whence, local_end - local_beg, local_beg, - local_end); - Printf("%s", d.Allocation()); StackTracePrinter::GetOrInit()->RenderSourceLocation( &location, local.decl_file, local.decl_line, /* column= */ 0, common_flags()->symbolize_vs_style, common_flags()->strip_path_prefix); - Printf(" %s in %s %s\n", local.name, local.function_name, - location.data()); + Printf( + "%p is located %zd bytes %s a %zd-byte local variable %s [%p,%p) " + "in %s %s\n", + untagged_addr, offset, whence, local_end - local_beg, local.name, + local_beg, local_end, local.function_name, location.data()); location.clear(); Printf("%s\n", d.Default()); } diff --git a/compiler-rt/test/hwasan/TestCases/stack-overflow.c b/compiler-rt/test/hwasan/TestCases/stack-overflow.c index 10e8d9c59e4bb..4af506e3ecf45 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-overflow.c +++ b/compiler-rt/test/hwasan/TestCases/stack-overflow.c @@ -17,8 +17,7 @@ int main() { // CHECK: is located in stack of thread // CHECK: Potentially referenced stack objects: // CHECK: Cause: stack-buffer-overflow - // CHECK-NEXT: 0x{{.*}} is located 1 bytes after a 64-byte region - // CHECK-NEXT: c in buggy {{.*}}stack-overflow.c: + // CHECK-NEXT: 0x{{.*}} is located 1 bytes after a 64-byte local variable c [0x{{.*}},0x{{.*}}) in buggy {{.*}}stack-overflow.c: // CHECK: Memory tags around the buggy address // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in buggy diff --git a/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c b/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c index 7a2a11593e7af..14b9cba8aa5e4 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c +++ b/compiler-rt/test/hwasan/TestCases/stack-uar-dynamic.c @@ -22,7 +22,6 @@ int main() { char *p = buggy(1); // CHECK: Potentially referenced stack objects: // CHECK-NEXT: use-after-scope - // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 64-byte region - // CHECK-NEXT: c in buggy + // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 64-byte local variable c [0x{{.*}},0x{{.*}}) in buggy p[0] = 0; } diff --git a/compiler-rt/test/hwasan/TestCases/stack-uar.c b/compiler-rt/test/hwasan/TestCases/stack-uar.c index 8810701f0c9ca..9fd4381a8049e 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-uar.c +++ 
b/compiler-rt/test/hwasan/TestCases/stack-uar.c @@ -51,8 +51,7 @@ int main() { // CHECK: is located in stack of thread // CHECK: Potentially referenced stack objects: // CHECK: Cause: use-after-scope - // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 2048-byte region - // CHECK-NEXT: {{zzz|yyy}} in buggy {{.*}}stack-uar.c: + // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 2048-byte local variable {{zzz|yyy}} [0x{{.*}},0x{{.*}}) in buggy {{.*}}stack-uar.c: // CHECK: Memory tags around the buggy address // NOSYM: Previously allocated frames: diff --git a/compiler-rt/test/hwasan/TestCases/stack-uas.c b/compiler-rt/test/hwasan/TestCases/stack-uas.c index 53a7054c1c435..a0e4eb02dd226 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-uas.c +++ b/compiler-rt/test/hwasan/TestCases/stack-uas.c @@ -70,8 +70,7 @@ int main() { // CHECK: is located in stack of thread // CHECK: Potentially referenced stack objects: // CHECK: Cause: use-after-scope - // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 2048-byte region - // CHECK-NEXT: {{zzz|yyy}} in buggy {{.*}}stack-uas.c: + // CHECK-NEXT: 0x{{.*}} is located 0 bytes inside a 2048-byte local variable {{zzz|yyy}} [0x{{.*}}) in buggy {{.*}}stack-uas.c: // CHECK: Memory tags around the buggy address // NOSYM: Previously allocated frames: diff --git a/compiler-rt/test/hwasan/TestCases/stack-underflow.c b/compiler-rt/test/hwasan/TestCases/stack-underflow.c index 8e5174519272f..e13955ed37b41 100644 --- a/compiler-rt/test/hwasan/TestCases/stack-underflow.c +++ b/compiler-rt/test/hwasan/TestCases/stack-underflow.c @@ -17,8 +17,7 @@ int main() { // CHECK: is located in stack of thread // CHECK: Potentially referenced stack objects: // CHECK: Cause: stack-buffer-overflow - // CHECK-NEXT: 0x{{.*}} is located 2 bytes before a 64-byte region - // CHECK-NEXT: c in buggy {{.*}}stack-underflow.c: + // CHECK-NEXT: 0x{{.*}} is located 2 bytes before a 64-byte local variable c [0x{{.*}},0x{{.*}}) in buggy {{.*}}stack-underflow.c: // CHECK: Memory tags around the buggy address // CHECK: SUMMARY: HWAddressSanitizer: tag-mismatch {{.*}} in buggy diff --git a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c index 80ef32699f8f4..22705ed35ce7e 100644 --- a/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c +++ b/compiler-rt/test/hwasan/TestCases/strip_path_prefix.c @@ -23,5 +23,5 @@ int main() { // CHECK: READ of size 1 at // CHECK: #0 {{.*}} in main strip_path_prefix.c:[[@LINE-2]] // CHECK: Potentially referenced stack objects: - // CHECK: zzz in buggy strip_path_prefix.c:[[@LINE-12]] + // CHECK: in buggy strip_path_prefix.c:[[@LINE-12]] } From aacff347af846e600d30eb1a8a38af75f8b33370 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng <dtcxzyw2333@gmail.com> Date: Wed, 27 Dec 2023 06:06:16 +0800 Subject: [PATCH 299/342] [InstCombine] Simplify `icmp pred (sdiv exact X, C), (sdiv exact Y, C)` into `icmp pred X, Y` when C is positive (#76409) Alive2: https://alive2.llvm.org/ce/z/u49dQ9 It will improve the codegen of `std::_Vector_base<T>::~_Vector_base()` when `sizeof(T)` is not a power of 2. NOTE: We can also fold `icmp signed-pred (sdiv exact X, C), (sdiv exact Y, C)` into `icmp signed-pred (sdiv exact Y, C), (sdiv exact X, C)` when C is negative. But I don't think it enables more optimizations for real-world applications. 
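As a concrete illustration of where the pattern comes from (a hedged sketch, not taken from libstdc++ itself): when a C++ type's size is not a power of two, pointer subtraction lowers to an `sdiv exact` of the byte distance by the element size, so comparing two pointer differences produces the `icmp pred (sdiv exact X, C), (sdiv exact Y, C)` form handled here once the usual simplifications have run.

```cpp
// sizeof(Elt) == 12 on common targets, i.e. not a power of two.
struct Elt {
  int a, b, c;
};

// (endX - beginX) and (endY - beginY) each lower to an `sdiv exact` of the
// byte distance by 12; with this patch the comparison of the two quotients
// folds to a direct comparison of the byte distances.
bool fewerElements(const Elt *beginX, const Elt *endX, const Elt *beginY,
                   const Elt *endY) {
  return (endX - beginX) < (endY - beginY);
}
```

Since 12 is positive here, the fold is valid for both signed and unsigned predicates; only a negative divisor would require swapping the operands of the signed predicates, which is the case the note above deliberately leaves alone.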
--- .../InstCombine/InstCombineCompares.cpp | 3 +- llvm/test/Transforms/InstCombine/icmp.ll | 101 ++++++++++++++++++ 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 289976718e52f..0222c93faf24e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -4966,7 +4966,8 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); case Instruction::SDiv: - if (!I.isEquality() || !BO0->isExact() || !BO1->isExact()) + if (!(I.isEquality() || match(BO0->getOperand(1), m_NonNegative())) || + !BO0->isExact() || !BO1->isExact()) break; return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index 1c7bb36f0d34c..9b2e141bdb050 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -854,6 +854,107 @@ define i1 @PR32949(i32 %X, i32 %Y, i32 %Z) { ret i1 %C } +define i1 @test_sdiv_pos_slt(i32 %x, i32 %y) { +; CHECK-LABEL: @test_sdiv_pos_slt( +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %divx = sdiv exact i32 %x, 40 + %divy = sdiv exact i32 %y, 40 + %cmp = icmp slt i32 %divx, %divy + ret i1 %cmp +} + +define i1 @test_sdiv_pos_sle(i32 %x, i32 %y) { +; CHECK-LABEL: @test_sdiv_pos_sle( +; CHECK-NEXT: [[CMP:%.*]] = icmp sle i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %divx = sdiv exact i32 %x, 40 + %divy = sdiv exact i32 %y, 40 + %cmp = icmp sle i32 %divx, %divy + ret i1 %cmp +} + +define i1 @test_sdiv_pos_sgt(i32 %x, i32 %y) { +; CHECK-LABEL: @test_sdiv_pos_sgt( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %divx = sdiv exact i32 %x, 40 + %divy = sdiv exact i32 %y, 40 + %cmp = icmp sgt i32 %divx, %divy + ret i1 %cmp +} + +define i1 @test_sdiv_pos_sge(i32 %x, i32 %y) { +; CHECK-LABEL: @test_sdiv_pos_sge( +; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %divx = sdiv exact i32 %x, 40 + %divy = sdiv exact i32 %y, 40 + %cmp = icmp sge i32 %divx, %divy + ret i1 %cmp +} + +define i1 @test_sdiv_pos_ult(i32 %x, i32 %y) { +; CHECK-LABEL: @test_sdiv_pos_ult( +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %divx = sdiv exact i32 %x, 40 + %divy = sdiv exact i32 %y, 40 + %cmp = icmp ult i32 %divx, %divy + ret i1 %cmp +} + +define i1 @test_sdiv_pos_ule(i32 %x, i32 %y) { +; CHECK-LABEL: @test_sdiv_pos_ule( +; CHECK-NEXT: [[CMP:%.*]] = icmp ule i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %divx = sdiv exact i32 %x, 40 + %divy = sdiv exact i32 %y, 40 + %cmp = icmp ule i32 %divx, %divy + ret i1 %cmp +} + +define i1 @test_sdiv_pos_ugt(i32 %x, i32 %y) { +; CHECK-LABEL: @test_sdiv_pos_ugt( +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %divx = sdiv exact i32 %x, 40 + %divy = sdiv exact i32 %y, 40 + %cmp = icmp ugt i32 %divx, %divy + ret i1 %cmp +} + +define i1 @test_sdiv_pos_uge(i32 %x, i32 %y) { +; CHECK-LABEL: @test_sdiv_pos_uge( +; CHECK-NEXT: [[CMP:%.*]] = icmp uge i32 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %divx = sdiv exact i32 %x, 40 + %divy = sdiv exact i32 %y, 40 + %cmp = icmp uge i32 %divx, %divy 
+ ret i1 %cmp +} + +define i1 @test_sdiv_neg_slt(i32 %x, i32 %y) { +; CHECK-LABEL: @test_sdiv_neg_slt( +; CHECK-NEXT: [[DIVX:%.*]] = sdiv exact i32 [[X:%.*]], -40 +; CHECK-NEXT: [[DIVY:%.*]] = sdiv exact i32 [[Y:%.*]], -40 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[DIVX]], [[DIVY]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %divx = sdiv exact i32 %x, -40 + %divy = sdiv exact i32 %y, -40 + %cmp = icmp slt i32 %divx, %divy + ret i1 %cmp +} + ; PR8469 define <2 x i1> @test49(<2 x i32> %i3) { ; CHECK-LABEL: @test49( From 532d4845ed0afced7cb1f8c7a4e3499b63cc3de3 Mon Sep 17 00:00:00 2001 From: "Balaji V. Iyer" <43187390+bviyer@users.noreply.github.com> Date: Tue, 26 Dec 2023 16:34:32 -0600 Subject: [PATCH 300/342] [mlir][Quasipolynomials] Fixed type issues in GeneratorFuunction.h (#76413) Fixed two issues: A SmallVector size that caused size-differences issue (8 vs. 12). Thus removed this size restriction. Also a constant parameter was causing an issue in a function not marked constant. --- mlir/lib/Analysis/Presburger/GeneratingFunction.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Analysis/Presburger/GeneratingFunction.h b/mlir/lib/Analysis/Presburger/GeneratingFunction.h index 8676b84c1c4df..dad9594f960d1 100644 --- a/mlir/lib/Analysis/Presburger/GeneratingFunction.h +++ b/mlir/lib/Analysis/Presburger/GeneratingFunction.h @@ -49,7 +49,7 @@ using Point = SmallVector<Fraction>; // g_{ij} \in Q^n are vectors. class GeneratingFunction { public: - GeneratingFunction(unsigned numParam, SmallVector<int, 8> signs, + GeneratingFunction(unsigned numParam, SmallVector<int> signs, std::vector<ParamPoint> nums, std::vector<std::vector<Point>> dens) : numParam(numParam), signs(signs), numerators(nums), denominators(dens) { @@ -67,7 +67,7 @@ class GeneratingFunction { std::vector<std::vector<Point>> getDenominators() { return denominators; } - GeneratingFunction operator+(const GeneratingFunction &gf) const { + GeneratingFunction operator+(GeneratingFunction &gf) const { assert(numParam == gf.getNumParams() && "two generating functions with different numbers of parameters " "cannot be added!"); @@ -81,7 +81,7 @@ class GeneratingFunction { std::vector<std::vector<Point>> sumDenominators = denominators; sumDenominators.insert(sumDenominators.end(), gf.denominators.begin(), gf.denominators.end()); - return GeneratingFunction(sumSigns, sumNumerators, sumDenominators); + return GeneratingFunction(0, sumSigns, sumNumerators, sumDenominators); } llvm::raw_ostream &print(llvm::raw_ostream &os) const { @@ -121,7 +121,7 @@ class GeneratingFunction { private: unsigned numParam; - SmallVector<int, 8> signs; + SmallVector<int> signs; std::vector<ParamPoint> numerators; std::vector<std::vector<Point>> denominators; }; @@ -129,4 +129,4 @@ class GeneratingFunction { } // namespace presburger } // namespace mlir -#endif // MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H \ No newline at end of file +#endif // MLIR_ANALYSIS_PRESBURGER_GENERATINGFUNCTION_H From 14e221aa68c17599c830f458b9727a84bb1df6a9 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 14:53:11 -0800 Subject: [PATCH 301/342] [flang][runtime] Correct EXw.0 output editing (#75121) A zero 'd' digit count in EX output editing has a meaning that's distinct from other numeric output editing descriptors, and I missed this in my initial implementation of the feature. 
d==0 means that the runtime should emit hexadecimal digits after the (hexa)decimal point until all of the rest of them would be zero. --- .../flang/Decimal/binary-floating-point.h | 2 +- flang/runtime/edit-output.cpp | 5 +- .../unittests/Runtime/NumericalFormatTest.cpp | 52 ++++++++++++------- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/flang/include/flang/Decimal/binary-floating-point.h b/flang/include/flang/Decimal/binary-floating-point.h index b9346a8585e2d..d1992819f85aa 100644 --- a/flang/include/flang/Decimal/binary-floating-point.h +++ b/flang/include/flang/Decimal/binary-floating-point.h @@ -143,7 +143,7 @@ class BinaryFloatingPointNumber : public common::RealDetails<BINARY_PRECISION> { if (IsNaN() || IsInfinite() || keepBits >= binaryPrecision) { return true; } - int lostBits{binaryPrecision - keepBits}; + int lostBits{keepBits < binaryPrecision ? binaryPrecision - keepBits : 0}; RawType lostMask{static_cast<RawType>((RawType{1} << lostBits) - 1)}; if (RawType lost{static_cast<RawType>(raw_ & lostMask)}; lost != 0) { bool increase{false}; diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp index a4ce0b12f9111..32b13a8007d0c 100644 --- a/flang/runtime/edit-output.cpp +++ b/flang/runtime/edit-output.cpp @@ -649,7 +649,7 @@ auto RealOutputEditing<KIND>::ConvertToHexadecimal( // x_.binaryPrecision is constant, so / can be used for readability. int shift{x_.binaryPrecision - 4}; typename BinaryFloatingPoint::RawType one{1}; - auto remaining{(one << shift) - one}; + auto remaining{(one << x_.binaryPrecision) - one}; for (int digits{0}; digits < significantDigits; ++digits) { if ((flags & decimal::Minimize) && !(fraction & remaining)) { break; @@ -682,7 +682,8 @@ bool RealOutputEditing<KIND>::EditEXOutput(const DataEdit &edit) { flags |= decimal::AlwaysSign; } int editWidth{edit.width.value_or(0)}; // 'w' field - if (editWidth == 0 && !edit.digits) { // EX0 (no .d) + if ((editWidth == 0 && !edit.digits) || editDigits == 0) { + // EX0 or EXw.0 flags |= decimal::Minimize; significantDigits = 28; // enough for 128-bit F.P. 
} diff --git a/flang/unittests/Runtime/NumericalFormatTest.cpp b/flang/unittests/Runtime/NumericalFormatTest.cpp index b5b8eb0594373..69637d8c6cb4c 100644 --- a/flang/unittests/Runtime/NumericalFormatTest.cpp +++ b/flang/unittests/Runtime/NumericalFormatTest.cpp @@ -654,28 +654,44 @@ TEST(IOApiTests, FormatDoubleValues) { {"(EX24.13,';')", " 0XF.FFFFFFFFFFFF8P+1020;"}, }}, {// EX rounding - 0x3ff1000000000000uLL, // 1.0625 + 0x3ff0100000000000uLL, { - {"(F7.4,';')", " 1.0625;"}, - {"(EX9.1,';')", " 0X8.8P-3;"}, - {"(EX9.0,';')", " 0X8.P-3;"}, - {"(RN,EX9.0,';')", " 0X8.P-3;"}, - {"(RU,EX9.0,';')", " 0X9.P-3;"}, - {"(RD,EX9.0,';')", " 0X8.P-3;"}, - {"(RZ,EX9.0,';')", " 0X8.P-3;"}, - {"(RC,EX9.0,';')", " 0X9.P-3;"}, + {"(F11.8,';')", " 1.00390625;"}, + {"(EX10.2,';')", " 0X8.08P-3;"}, + {"(EX10.1,';')", " 0X8.0P-3;"}, + {"(EX10.0,';')", " 0X8.08P-3;"}, + {"(EX0.0,';')", "0X8.08P-3;"}, + {"(EX0,';')", "0X8.08P-3;"}, + {"(RN,EX10.1,';')", " 0X8.0P-3;"}, + {"(RU,EX10.1,';')", " 0X8.1P-3;"}, + {"(RD,EX10.1,';')", " 0X8.0P-3;"}, + {"(RZ,EX10.1,';')", " 0X8.0P-3;"}, + {"(RC,EX10.1,';')", " 0X8.1P-3;"}, + {"(RN,EX10.0,';')", " 0X8.08P-3;"}, + {"(RU,EX10.0,';')", " 0X8.08P-3;"}, + {"(RD,EX10.0,';')", " 0X8.08P-3;"}, + {"(RZ,EX10.0,';')", " 0X8.08P-3;"}, + {"(RC,EX10.0,';')", " 0X8.08P-3;"}, }}, {// EX rounding - 0xbff1000000000000uLL, // -1.0625 + 0xbff0100000000000uLL, { - {"(F7.4,';')", "-1.0625;"}, - {"(EX9.1,';')", "-0X8.8P-3;"}, - {"(EX9.0,';')", " -0X8.P-3;"}, - {"(RN,EX9.0,';')", " -0X8.P-3;"}, - {"(RU,EX9.0,';')", " -0X8.P-3;"}, - {"(RD,EX9.0,';')", " -0X9.P-3;"}, - {"(RZ,EX9.0,';')", " -0X8.P-3;"}, - {"(RC,EX9.0,';')", " -0X9.P-3;"}, + {"(F11.8,';')", "-1.00390625;"}, + {"(EX10.2,';')", "-0X8.08P-3;"}, + {"(EX10.1,';')", " -0X8.0P-3;"}, + {"(EX10.0,';')", "-0X8.08P-3;"}, + {"(EX0.0,';')", "-0X8.08P-3;"}, + {"(EX0,';')", "-0X8.08P-3;"}, + {"(RN,EX10.1,';')", " -0X8.0P-3;"}, + {"(RU,EX10.1,';')", " -0X8.0P-3;"}, + {"(RD,EX10.1,';')", " -0X8.1P-3;"}, + {"(RZ,EX10.1,';')", " -0X8.0P-3;"}, + {"(RC,EX10.1,';')", " -0X8.1P-3;"}, + {"(RN,EX10.0,';')", "-0X8.08P-3;"}, + {"(RU,EX10.0,';')", "-0X8.08P-3;"}, + {"(RD,EX10.0,';')", "-0X8.08P-3;"}, + {"(RZ,EX10.0,';')", "-0X8.08P-3;"}, + {"(RC,EX10.0,';')", "-0X8.08P-3;"}, }}, }; From befdfae198a12b88bce6d26f840e6f71ce4a8b0c Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:00:43 -0800 Subject: [PATCH 302/342] [flang][runtime] Detect & signal underflow when reading reals (#75232) Extend decimal->binary conversion to detect underflow cases and raise the corresponding floating-point exception. 
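As a rough standalone sketch of the signaling half (the enum below is a stand-in for the runtime's ConversionResultFlags, and the real code goes through its RAISE wrapper rather than calling <cfenv> directly):

  #include <cfenv>

  // Stand-in for decimal::ConversionResultFlags; Underflow = 8 is the flag
  // this patch adds relative to the existing enumerators.
  enum ConversionFlags { Exact = 0, Overflow = 1, Inexact = 2, Invalid = 4, Underflow = 8 };

  // Translate conversion flags into IEEE exception flags on the host, in the
  // spirit of the runtime's RaiseFPExceptions helper.
  void raiseConversionExceptions(int flags) {
    if (flags & Overflow)
      std::feraiseexcept(FE_OVERFLOW);
    if (flags & Underflow)
      std::feraiseexcept(FE_UNDERFLOW);
    if (flags & Inexact)
      std::feraiseexcept(FE_INEXACT);
    if (flags & Invalid)
      std::feraiseexcept(FE_INVALID);
  }
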
--- flang/include/flang/Decimal/decimal.h | 1 + flang/lib/Decimal/decimal-to-binary.cpp | 31 +++++++++++++++++-------- flang/runtime/edit-input.cpp | 7 +++++- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/flang/include/flang/Decimal/decimal.h b/flang/include/flang/Decimal/decimal.h index a4e0ee7c84746..f0997fb63df01 100644 --- a/flang/include/flang/Decimal/decimal.h +++ b/flang/include/flang/Decimal/decimal.h @@ -34,6 +34,7 @@ enum ConversionResultFlags { Overflow = 1, Inexact = 2, Invalid = 4, + Underflow = 8, }; struct ConversionToDecimalResult { diff --git a/flang/lib/Decimal/decimal-to-binary.cpp b/flang/lib/Decimal/decimal-to-binary.cpp index d5b66b9fb9338..780979f747f5b 100644 --- a/flang/lib/Decimal/decimal-to-binary.cpp +++ b/flang/lib/Decimal/decimal-to-binary.cpp @@ -256,12 +256,17 @@ ConversionToBinaryResult<PREC> IntermediateFloat<PREC>::ToBinary( if (guard != 0) { flags |= Inexact; } - if (fraction == 0 && guard <= oneHalf) { - if ((!isNegative && rounding == RoundUp) || - (isNegative && rounding == RoundDown)) { - // round to minimum nonzero value - } else { - return {Binary{}, static_cast<enum ConversionResultFlags>(flags)}; + if (fraction == 0) { + if (guard <= oneHalf) { + if ((!isNegative && rounding == RoundUp) || + (isNegative && rounding == RoundDown)) { + // round to minimum nonzero value + } else { // round to zero + if (guard != 0) { + flags |= Underflow; + } + return {Binary{}, static_cast<enum ConversionResultFlags>(flags)}; + } } } else { // The value is nonzero; normalize it. @@ -301,8 +306,10 @@ ConversionToBinaryResult<PREC> IntermediateFloat<PREC>::ToBinary( } if (expo == 1 && fraction < topBit) { expo = 0; // subnormal - } - if (expo >= Binary::maxExponent) { + flags |= Underflow; + } else if (expo == 0) { + flags |= Underflow; + } else if (expo >= Binary::maxExponent) { expo = Binary::maxExponent; // Inf flags |= Overflow; if constexpr (Binary::bits == 80) { // x87 @@ -338,11 +345,15 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToBinary() { // Sanity checks for ridiculous exponents static constexpr int crazy{2 * Real::decimalRange + log10Radix}; if (exponent_ < -crazy) { + enum ConversionResultFlags flags { + static_cast<enum ConversionResultFlags>(Inexact | Underflow) + }; if ((!isNegative_ && rounding_ == RoundUp) || (isNegative_ && rounding_ == RoundDown)) { - return {Real{Raw{1} | SignBit()}}; // return least nonzero value + // return least nonzero value + return {Real{Raw{1} | SignBit()}, flags}; } else { // underflow to +/-0. - return {Real{SignBit()}, Inexact}; + return {Real{SignBit()}, flags}; } } else if (exponent_ > crazy) { // overflow to +/-Inf. return {Real{Infinity()}, Overflow}; diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp index 822099b5141b1..6e26e523c5ca7 100644 --- a/flang/runtime/edit-input.cpp +++ b/flang/runtime/edit-input.cpp @@ -478,6 +478,9 @@ static void RaiseFPExceptions(decimal::ConversionResultFlags flags) { if (flags & decimal::ConversionResultFlags::Overflow) { RAISE(FE_OVERFLOW); } + if (flags & decimal::ConversionResultFlags::Underflow) { + RAISE(FE_UNDERFLOW); + } if (flags & decimal::ConversionResultFlags::Inexact) { RAISE(FE_INEXACT); } @@ -640,10 +643,12 @@ decimal::ConversionToBinaryResult<binaryPrecision> ConvertHexadecimal( } // Package & return result constexpr RawType significandMask{(one << RealType::significandBits) - 1}; + int flags{(roundingBit | guardBit) ? 
decimal::Inexact : decimal::Exact}; if (!fraction) { expo = 0; } else if (expo == 1 && !(fraction >> (binaryPrecision - 1))) { expo = 0; // subnormal + flags |= decimal::Underflow; } else if (expo >= RealType::maxExponent) { expo = RealType::maxExponent; // +/-Inf fraction = 0; @@ -653,7 +658,7 @@ decimal::ConversionToBinaryResult<binaryPrecision> ConvertHexadecimal( return decimal::ConversionToBinaryResult<binaryPrecision>{ RealType{static_cast<RawType>(signBit | static_cast<RawType>(expo) << RealType::significandBits | fraction)}, - (roundingBit | guardBit) ? decimal::Inexact : decimal::Exact}; + static_cast<enum decimal::ConversionResultFlags>(flags)}; } template <int KIND> From 8fc045e26bf90586c287256c5f1da5a7826cdb90 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:12:39 -0800 Subject: [PATCH 303/342] [flang][runtime] Accept 128-bit integer SHIFT values in CSHIFT/EOSHIFT (#75246) It would surprise me if this case ever arose outside a couple of tests in llvm-test-suite/Fortran/gfortran/regression (namely cshift_large_1.f90 and eoshift_large_1.f90), but now at least those tests will pass. --- flang/runtime/tools.h | 25 ++++++++++++++++++++ flang/runtime/transformational.cpp | 37 +++++++++++++++++++----------- 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h index 9811bce25acd3..ff05e76c8bb7b 100644 --- a/flang/runtime/tools.h +++ b/flang/runtime/tools.h @@ -94,6 +94,31 @@ static inline RT_API_ATTRS std::int64_t GetInt64( } } +static inline RT_API_ATTRS std::optional<std::int64_t> GetInt64Safe( + const char *p, std::size_t bytes, Terminator &terminator) { + switch (bytes) { + case 1: + return *reinterpret_cast<const CppTypeFor<TypeCategory::Integer, 1> *>(p); + case 2: + return *reinterpret_cast<const CppTypeFor<TypeCategory::Integer, 2> *>(p); + case 4: + return *reinterpret_cast<const CppTypeFor<TypeCategory::Integer, 4> *>(p); + case 8: + return *reinterpret_cast<const CppTypeFor<TypeCategory::Integer, 8> *>(p); + case 16: { + using Int128 = CppTypeFor<TypeCategory::Integer, 16>; + auto n{*reinterpret_cast<const Int128 *>(p)}; + std::int64_t result = n; + if (result == n) { + return result; + } + return std::nullopt; + } + default: + terminator.Crash("GetInt64Safe: no case for %zd bytes", bytes); + } +} + template <typename INT> inline RT_API_ATTRS bool SetInteger(INT &x, int kind, std::int64_t value) { switch (kind) { diff --git a/flang/runtime/transformational.cpp b/flang/runtime/transformational.cpp index da8ec05c884fa..cf1e61c0844d8 100644 --- a/flang/runtime/transformational.cpp +++ b/flang/runtime/transformational.cpp @@ -52,9 +52,11 @@ class ShiftControl { } } } + } else if (auto count{GetInt64Safe( + shift_.OffsetElement<char>(), shiftElemLen_, terminator_)}) { + shiftCount_ = *count; } else { - shiftCount_ = - GetInt64(shift_.OffsetElement<char>(), shiftElemLen_, terminator_); + terminator_.Crash("%s: SHIFT= value exceeds 64 bits", which); } } RT_API_ATTRS SubscriptValue GetShift(const SubscriptValue resultAt[]) const { @@ -67,8 +69,10 @@ class ShiftControl { ++k; } } - return GetInt64( - shift_.Element<char>(shiftAt), shiftElemLen_, terminator_); + auto count{GetInt64Safe( + shift_.Element<char>(shiftAt), shiftElemLen_, terminator_)}; + RUNTIME_CHECK(terminator_, count.has_value()); + return *count; } else { return shiftCount_; // invariant count extracted in Init() } @@ -719,12 +723,15 @@ void RTDEF(Reshape)(Descriptor &result, const Descriptor 
&source, std::size_t resultElements{1}; SubscriptValue shapeSubscript{shape.GetDimension(0).LowerBound()}; for (int j{0}; j < resultRank; ++j, ++shapeSubscript) { - resultExtent[j] = GetInt64( - shape.Element<char>(&shapeSubscript), shapeElementBytes, terminator); - if (resultExtent[j] < 0) { + auto extent{GetInt64Safe( + shape.Element<char>(&shapeSubscript), shapeElementBytes, terminator)}; + if (!extent) { + terminator.Crash("RESHAPE: value of SHAPE(%d) exceeds 64 bits", j + 1); + } else if (*extent < 0) { terminator.Crash("RESHAPE: bad value for SHAPE(%d)=%jd", j + 1, - static_cast<std::intmax_t>(resultExtent[j])); + static_cast<std::intmax_t>(*extent)); } + resultExtent[j] = *extent; resultElements *= resultExtent[j]; } @@ -762,14 +769,16 @@ void RTDEF(Reshape)(Descriptor &result, const Descriptor &source, SubscriptValue orderSubscript{order->GetDimension(0).LowerBound()}; std::size_t orderElementBytes{order->ElementBytes()}; for (SubscriptValue j{0}; j < resultRank; ++j, ++orderSubscript) { - auto k{GetInt64(order->Element<char>(&orderSubscript), orderElementBytes, - terminator)}; - if (k < 1 || k > resultRank || ((values >> k) & 1)) { + auto k{GetInt64Safe(order->Element<char>(&orderSubscript), + orderElementBytes, terminator)}; + if (!k) { + terminator.Crash("RESHAPE: ORDER element value exceeds 64 bits"); + } else if (*k < 1 || *k > resultRank || ((values >> *k) & 1)) { terminator.Crash("RESHAPE: bad value for ORDER element (%jd)", - static_cast<std::intmax_t>(k)); + static_cast<std::intmax_t>(*k)); } - values |= std::uint64_t{1} << k; - dimOrder[j] = k - 1; + values |= std::uint64_t{1} << *k; + dimOrder[j] = *k - 1; } } else { for (int j{0}; j < resultRank; ++j) { From 475d18f15db2d6925f81c39900e725e1fe28cba1 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:18:19 -0800 Subject: [PATCH 304/342] [flang][runtime] Fix empty FINDLOC() results (#75251) When FINDLOC() can't find its target value among the unmasked array elements, it must return a zero result. Its implementation doesn't sufficiently distinguish a zero result from a hit in an array with lower bound(s) less than one. Fix by adding a flag to distinguish the case with no hits from cases with hits. Fixes llvm-test-suite/Fortran/gfortran/regression/findloc_6.f90. --- flang/runtime/findloc.cpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/flang/runtime/findloc.cpp b/flang/runtime/findloc.cpp index 339e0c75f05fe..6b60e523d2a47 100644 --- a/flang/runtime/findloc.cpp +++ b/flang/runtime/findloc.cpp @@ -84,27 +84,27 @@ template <typename EQUALITY> class LocationAccumulator { public: LocationAccumulator( const Descriptor &array, const Descriptor &target, bool back) - : array_{array}, target_{target}, back_{back} { - Reinitialize(); - } - void Reinitialize() { - // per standard: result indices are all zero if no data - for (int j{0}; j < rank_; ++j) { - location_[j] = 0; - } - } + : array_{array}, target_{target}, back_{back} {} + void Reinitialize() { gotAnything_ = false; } template <typename A> void GetResult(A *p, int zeroBasedDim = -1) { if (zeroBasedDim >= 0) { - *p = location_[zeroBasedDim] - - array_.GetDimension(zeroBasedDim).LowerBound() + 1; - } else { + *p = gotAnything_ ? 
location_[zeroBasedDim] - + array_.GetDimension(zeroBasedDim).LowerBound() + 1 + : 0; + } else if (gotAnything_) { for (int j{0}; j < rank_; ++j) { p[j] = location_[j] - array_.GetDimension(j).LowerBound() + 1; } + } else { + // no unmasked hits? result is all zeroes + for (int j{0}; j < rank_; ++j) { + p[j] = 0; + } } } template <typename IGNORED> bool AccumulateAt(const SubscriptValue at[]) { if (equality_(array_, at, target_)) { + gotAnything_ = true; for (int j{0}; j < rank_; ++j) { location_[j] = at[j]; } @@ -119,6 +119,7 @@ template <typename EQUALITY> class LocationAccumulator { const Descriptor &target_; const bool back_{false}; const int rank_{array_.rank()}; + bool gotAnything_{false}; SubscriptValue location_[maxRank]; const EQUALITY equality_{}; }; From 933882f73971546d529ab225cb9bb982ed5ff47b Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:23:01 -0800 Subject: [PATCH 305/342] [flang][runtime] Fix trailing blanks for Gw.dEe output editing (#75263) When generalized numeric output editing of real data maps to Fw.d output editing, either two or four trailing blanks are emitted depending on the presence and value of 'e'. The code that detects field width overflow didn't take these trailing blanks into account, and sometimes the field width adjustment was producing an F0.d output edit descriptor (no fixed field width). Fix by retaining the original field width, but requiring it to also accommodate the trailing blanks. Fixes llvm-test-suite/Fortran/gfortran/regression/fmt_g.f. --- flang/runtime/edit-output.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/flang/runtime/edit-output.cpp b/flang/runtime/edit-output.cpp index 32b13a8007d0c..26e066c85fed3 100644 --- a/flang/runtime/edit-output.cpp +++ b/flang/runtime/edit-output.cpp @@ -341,11 +341,12 @@ bool RealOutputEditing<KIND>::EditEorDOutput(const DataEdit &edit) { ConvertToDecimal(significantDigits, edit.modes.round, flags)}; if (IsInfOrNaN(converted.str, static_cast<int>(converted.length))) { return editWidth > 0 && - converted.length > static_cast<std::size_t>(editWidth) + converted.length + trailingBlanks_ > + static_cast<std::size_t>(editWidth) ? EmitRepeated(io_, '*', editWidth) : EmitPrefix(edit, converted.length, editWidth) && EmitAscii(io_, converted.str, converted.length) && - EmitSuffix(edit); + EmitRepeated(io_, ' ', trailingBlanks_) && EmitSuffix(edit); } if (!IsZero()) { converted.decimalExponent -= scale; @@ -522,8 +523,9 @@ bool RealOutputEditing<KIND>::EditFOutput(const DataEdit &edit) { zeroesBeforePoint = 1; // "." -> "0." } int totalLength{signLength + digitsBeforePoint + zeroesBeforePoint + - 1 /*'.'*/ + zeroesAfterPoint + digitsAfterPoint + trailingZeroes}; - int width{editWidth > 0 ? editWidth : totalLength}; + 1 /*'.'*/ + zeroesAfterPoint + digitsAfterPoint + trailingZeroes + + trailingBlanks_ /* G editing converted to F */}; + int width{editWidth > 0 || trailingBlanks_ ? editWidth : totalLength}; if (totalLength > width) { return EmitRepeated(io_, '*', width); } @@ -574,8 +576,11 @@ DataEdit RealOutputEditing<KIND>::EditForGOutput(DataEdit edit) { trailingBlanks_ = 0; if (editWidth > 0) { int expoDigits{edit.expoDigits.value_or(0)}; + // F'2023 13.7.5.2.3 p5: "If 0 <= s <= d, the scale factor has no effect + // and F(w − n).(d − s),n(’b’) editing is used where b is a blank and + // n is 4 for Gw.d editing, e + 2 for Gw.dEe editing if e > 0, and + // 4 for Gw.dE0 editing." 
trailingBlanks_ = expoDigits > 0 ? expoDigits + 2 : 4; // 'n' - *edit.width = std::max(0, editWidth - trailingBlanks_); } if (edit.digits.has_value()) { *edit.digits = std::max(0, *edit.digits - expo); From 39c2f59709f454f04cc13151301ca19c4ba9c152 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:28:36 -0800 Subject: [PATCH 306/342] [flang][runtime] Fix NEAREST() when exponent decreases (#75368) When the result of NEAREST() has an exponent less than that of the argument (e.g., NEAREST(1.,-1.) and NEAREST(-1.,1.)), the result was wrong, because the increment value uses the result of SPACING() in terms of the argument. Fix by just calling into the C runtime routine std::nextafter(). --- flang/runtime/numeric.cpp | 8 +++----- flang/unittests/Runtime/Numeric.cpp | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/flang/runtime/numeric.cpp b/flang/runtime/numeric.cpp index 25e58e79dbba0..38835c2b753ce 100644 --- a/flang/runtime/numeric.cpp +++ b/flang/runtime/numeric.cpp @@ -261,12 +261,10 @@ template <int PREC, typename T> inline RT_API_ATTRS T Spacing(T x) { // NEAREST (16.9.139) template <int PREC, typename T> inline RT_API_ATTRS T Nearest(T x, bool positive) { - auto spacing{Spacing<PREC>(x)}; - if (x == 0) { - auto least{std::numeric_limits<T>::denorm_min()}; - return positive ? least : -least; + if (positive) { + return std::nextafter(x, std::numeric_limits<T>::infinity()); } else { - return positive ? x + spacing : x - spacing; + return std::nextafter(x, -std::numeric_limits<T>::infinity()); } } diff --git a/flang/unittests/Runtime/Numeric.cpp b/flang/unittests/Runtime/Numeric.cpp index 5afed750c0b18..43263d1ac4231 100644 --- a/flang/unittests/Runtime/Numeric.cpp +++ b/flang/unittests/Runtime/Numeric.cpp @@ -86,7 +86,7 @@ TEST(Numeric, Nearest) { EXPECT_EQ(RTNAME(Nearest8)(Real<8>{1.0}, true), Real<8>{1.0} + std::ldexp(Real<8>{1.0}, -52)); EXPECT_EQ(RTNAME(Nearest8)(Real<8>{1.0}, false), - Real<8>{1.0} - std::ldexp(Real<8>{1.0}, -52)); + Real<8>{1.0} - 0.5 * std::ldexp(Real<8>{1.0}, -52)); } TEST(Numeric, Nint) { From 1794b61472515078e0a16ec2accf972ef1adf4be Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:33:34 -0800 Subject: [PATCH 307/342] [flang][runtime] Fix spelling of INQUIRE result (#75372) Embarrassingly, the runtime was returning UNKNONN rather than UNKNOWN for things like INQUIRE(..., FORMAT=). --- flang/runtime/io-stmt.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp index dedf1f8364ad3..4072dbf265944 100644 --- a/flang/runtime/io-stmt.cpp +++ b/flang/runtime/io-stmt.cpp @@ -1346,7 +1346,7 @@ bool InquireUnconnectedFileState::Inquire( case HashInquiryKeyword("SEQUENTIAL"): case HashInquiryKeyword("STREAM"): case HashInquiryKeyword("UNFORMATTED"): - str = "UNKNONN"; + str = "UNKNOWN"; break; case HashInquiryKeyword("READ"): str = From 7b50176805ff53994ab87bbce7c3b7840bbc85ab Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:40:19 -0800 Subject: [PATCH 308/342] [flang][runtime] Flush output before INQUIRE(..., SIZE=) (#75379) Ensure that any buffered data has tranferred to an external unit before measuring its file size. 
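The hazard is ordinary buffered-I/O staleness; a small host-level analogue, assuming POSIX and with all names invented for illustration:

  #include <cstdio>
  #include <sys/stat.h>

  // Sketch: without the fflush(), the written bytes may still sit in stdio's
  // user-space buffer and stat() reports a stale size -- the same hazard
  // INQUIRE(..., SIZE=) had for a connected, buffered unit.
  long sizeAfterWrite(const char *path) {
    std::FILE *f = std::fopen(path, "w");
    if (!f)
      return -1;
    std::fputs("hello\n", f);
    std::fflush(f); // analogous to unit().FlushOutput(*this) before reading SIZE
    struct stat st;
    long size = ::stat(path, &st) == 0 ? static_cast<long>(st.st_size) : -1;
    std::fclose(f);
    return size;
  }
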
--- flang/runtime/io-stmt.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp index 4072dbf265944..398f6127cf275 100644 --- a/flang/runtime/io-stmt.cpp +++ b/flang/runtime/io-stmt.cpp @@ -1219,6 +1219,7 @@ bool InquireUnitState::Inquire( case HashInquiryKeyword("SIZE"): result = -1; if (unit().IsConnected()) { + unit().FlushOutput(*this); if (auto size{unit().knownSize()}) { result = *size; } From 9469dc38b01b857a6bef8a57480a30ddba61647d Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:44:31 -0800 Subject: [PATCH 309/342] [flang][runtime] Handle unconsumed repeated list-directed input items (#75400) If list-directed input contains a repeated item ("20*123.0") that is not fully consumed by the READ statement's data item list, the end of that READ statement was repositioning the input to the repeated value ("123.0"), leading to later confusion. Cancel the input item repetition during EndIoStatement() processing to prevent this misbehavior. Fixes llvm-test-suite/Fortran/gfortran/regression/list_read_4.f90. --- flang/runtime/connection.cpp | 16 ++++++++------- flang/runtime/connection.h | 2 ++ flang/runtime/io-stmt.cpp | 39 ++++++++++++++++++++++++++++++++++++ flang/runtime/io-stmt.h | 7 +++++++ 4 files changed, 57 insertions(+), 7 deletions(-) diff --git a/flang/runtime/connection.cpp b/flang/runtime/connection.cpp index 0abacd7995b47..91ac9a0e14e47 100644 --- a/flang/runtime/connection.cpp +++ b/flang/runtime/connection.cpp @@ -46,13 +46,15 @@ SavedPosition::SavedPosition(IoStatementState &io) : io_{io} { } SavedPosition::~SavedPosition() { - ConnectionState &conn{io_.GetConnectionState()}; - while (conn.currentRecordNumber > saved_.currentRecordNumber) { - io_.BackspaceRecord(); + if (!cancelled_) { + ConnectionState &conn{io_.GetConnectionState()}; + while (conn.currentRecordNumber > saved_.currentRecordNumber) { + io_.BackspaceRecord(); + } + conn.leftTabLimit = saved_.leftTabLimit; + conn.furthestPositionInRecord = saved_.furthestPositionInRecord; + conn.positionInRecord = saved_.positionInRecord; + conn.pinnedFrame = saved_.pinnedFrame; } - conn.leftTabLimit = saved_.leftTabLimit; - conn.furthestPositionInRecord = saved_.furthestPositionInRecord; - conn.positionInRecord = saved_.positionInRecord; - conn.pinnedFrame = saved_.pinnedFrame; } } // namespace Fortran::runtime::io diff --git a/flang/runtime/connection.h b/flang/runtime/connection.h index 70c20e17fd01a..c9a7566f20988 100644 --- a/flang/runtime/connection.h +++ b/flang/runtime/connection.h @@ -111,10 +111,12 @@ class SavedPosition { public: explicit SavedPosition(IoStatementState &); ~SavedPosition(); + void Cancel() { cancelled_ = true; } private: IoStatementState &io_; ConnectionState saved_; + bool cancelled_{false}; }; } // namespace Fortran::runtime::io diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp index 398f6127cf275..921c6e625edb5 100644 --- a/flang/runtime/io-stmt.cpp +++ b/flang/runtime/io-stmt.cpp @@ -189,6 +189,17 @@ InternalListIoStatementState<DIR>::InternalListIoStatementState( : InternalIoStatementState<DIR>{d, sourceFile, sourceLine}, ioStatementState_{*this} {} +template <Direction DIR> +int InternalListIoStatementState<DIR>::EndIoStatement() { + if constexpr (DIR == Direction::Input) { + if (int status{ListDirectedStatementState<DIR>::EndIoStatement()}; + status != IostatOk) { + return status; + } + } + return InternalIoStatementState<DIR>::EndIoStatement(); +} + 
ExternalIoStatementBase::ExternalIoStatementBase( ExternalFileUnit &unit, const char *sourceFile, int sourceLine) : IoStatementBase{sourceFile, sourceLine}, unit_{unit} {} @@ -707,6 +718,13 @@ ListDirectedStatementState<Direction::Output>::GetNextDataEdit( return edit; } +int ListDirectedStatementState<Direction::Input>::EndIoStatement() { + if (repeatPosition_) { + repeatPosition_->Cancel(); + } + return IostatOk; +} + std::optional<DataEdit> ListDirectedStatementState<Direction::Input>::GetNextDataEdit( IoStatementState &io, int maxRepeat) { @@ -818,6 +836,17 @@ ListDirectedStatementState<Direction::Input>::GetNextDataEdit( return edit; } +template <Direction DIR> +int ExternalListIoStatementState<DIR>::EndIoStatement() { + if constexpr (DIR == Direction::Input) { + if (auto status{ListDirectedStatementState<DIR>::EndIoStatement()}; + status != IostatOk) { + return status; + } + } + return ExternalIoStatementState<DIR>::EndIoStatement(); +} + template <Direction DIR> bool ExternalUnformattedIoStatementState<DIR>::Receive( char *data, std::size_t bytes, std::size_t elementBytes) { @@ -910,6 +939,16 @@ bool ChildUnformattedIoStatementState<DIR>::Receive( return this->child().parent().Receive(data, bytes, elementBytes); } +template <Direction DIR> int ChildListIoStatementState<DIR>::EndIoStatement() { + if constexpr (DIR == Direction::Input) { + if (int status{ListDirectedStatementState<DIR>::EndIoStatement()}; + status != IostatOk) { + return status; + } + } + return ChildIoStatementState<DIR>::EndIoStatement(); +} + template class InternalIoStatementState<Direction::Output>; template class InternalIoStatementState<Direction::Input>; template class InternalFormattedIoStatementState<Direction::Output>; diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h index 91169f6c6e323..0b6bcbd9af025 100644 --- a/flang/runtime/io-stmt.h +++ b/flang/runtime/io-stmt.h @@ -304,6 +304,7 @@ class ListDirectedStatementState<Direction::Input> : public FormattedIoStatementState<Direction::Input> { public: bool inNamelistSequence() const { return inNamelistSequence_; } + int EndIoStatement(); // Skips value separators, handles repetition and null values. // Vacant when '/' appears; present with descriptor == ListDirectedNullValue @@ -317,6 +318,9 @@ class ListDirectedStatementState<Direction::Input> // NAMELIST input item. 
void ResetForNextNamelistItem(bool inNamelistSequence) { remaining_ = 0; + if (repeatPosition_) { + repeatPosition_->Cancel(); + } eatComma_ = false; realPart_ = imaginaryPart_ = false; inNamelistSequence_ = inNamelistSequence; @@ -399,6 +403,7 @@ class InternalListIoStatementState : public InternalIoStatementState<DIR>, const Descriptor &, const char *sourceFile = nullptr, int sourceLine = 0); IoStatementState &ioStatementState() { return ioStatementState_; } using ListDirectedStatementState<DIR>::GetNextDataEdit; + int EndIoStatement(); private: IoStatementState ioStatementState_; // points to *this @@ -474,6 +479,7 @@ class ExternalListIoStatementState : public ExternalIoStatementState<DIR>, public: using ExternalIoStatementState<DIR>::ExternalIoStatementState; using ListDirectedStatementState<DIR>::GetNextDataEdit; + int EndIoStatement(); }; template <Direction DIR> @@ -532,6 +538,7 @@ class ChildListIoStatementState : public ChildIoStatementState<DIR>, public: using ChildIoStatementState<DIR>::ChildIoStatementState; using ListDirectedStatementState<DIR>::GetNextDataEdit; + int EndIoStatement(); }; template <Direction DIR> From 1346037fffae8f173c6915689349abff0118f952 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:49:09 -0800 Subject: [PATCH 310/342] [flang][runtime] Return +/-HUGE() for some real input roundings (#75525) The Fortran standard says that overflow input cases in some rounding modes (RZ, RD, RU) should round to a "representable" number. Some Fortran compilers interpret this to mean +/-HUGE(), some as +/-Inf. Follow the precedent of gfortran and the Intel compilers. --- flang/lib/Decimal/big-radix-floating-point.h | 6 + flang/lib/Decimal/decimal-to-binary.cpp | 36 ++++-- flang/runtime/edit-input.cpp | 14 ++- .../unittests/Runtime/NumericalFormatTest.cpp | 109 ++++++++++-------- 4 files changed, 108 insertions(+), 57 deletions(-) diff --git a/flang/lib/Decimal/big-radix-floating-point.h b/flang/lib/Decimal/big-radix-floating-point.h index 7d5d31b7788d7..2143d1d9b3f77 100644 --- a/flang/lib/Decimal/big-radix-floating-point.h +++ b/flang/lib/Decimal/big-radix-floating-point.h @@ -369,6 +369,12 @@ template <int PREC, int LOG10RADIX = 16> class BigRadixFloatingPointNumber { } return result; } + constexpr Raw HUGE() const { + Raw result{static_cast<Raw>(Real::maxExponent)}; + result <<= Real::significandBits; + result |= SignBit(); + return result - 1; // decrement exponent, set all significand bits + } Digit digit_[maxDigits]; // in little-endian order: digit_[0] is LSD int digits_{0}; // # of elements in digit_[] array; zero when zero diff --git a/flang/lib/Decimal/decimal-to-binary.cpp b/flang/lib/Decimal/decimal-to-binary.cpp index 780979f747f5b..05a5f038353e0 100644 --- a/flang/lib/Decimal/decimal-to-binary.cpp +++ b/flang/lib/Decimal/decimal-to-binary.cpp @@ -237,6 +237,15 @@ template <int PREC> class IntermediateFloat { int exponent_{0}; }; +// The standard says that these overflow cases round to "representable" +// numbers, and some popular compilers interpret that to mean +/-HUGE() +// rather than +/-Inf. 
+static inline constexpr bool RoundOverflowToHuge( + enum FortranRounding rounding, bool isNegative) { + return rounding == RoundToZero || (!isNegative && rounding == RoundDown) || + (isNegative && rounding == RoundUp); +} + template <int PREC> ConversionToBinaryResult<PREC> IntermediateFloat<PREC>::ToBinary( bool isNegative, FortranRounding rounding) const { @@ -260,7 +269,7 @@ ConversionToBinaryResult<PREC> IntermediateFloat<PREC>::ToBinary( if (guard <= oneHalf) { if ((!isNegative && rounding == RoundUp) || (isNegative && rounding == RoundDown)) { - // round to minimum nonzero value + // round to least nonzero value } else { // round to zero if (guard != 0) { flags |= Underflow; @@ -310,12 +319,17 @@ ConversionToBinaryResult<PREC> IntermediateFloat<PREC>::ToBinary( } else if (expo == 0) { flags |= Underflow; } else if (expo >= Binary::maxExponent) { - expo = Binary::maxExponent; // Inf - flags |= Overflow; - if constexpr (Binary::bits == 80) { // x87 - fraction = IntType{1} << 63; - } else { - fraction = 0; + if (RoundOverflowToHuge(rounding, isNegative)) { + expo = Binary::maxExponent - 1; + fraction = mask; + } else { // Inf + expo = Binary::maxExponent; + flags |= Overflow; + if constexpr (Binary::bits == 80) { // x87 + fraction = IntType{1} << 63; + } else { + fraction = 0; + } } } using Raw = typename Binary::RawType; @@ -355,8 +369,12 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToBinary() { } else { // underflow to +/-0. return {Real{SignBit()}, flags}; } - } else if (exponent_ > crazy) { // overflow to +/-Inf. - return {Real{Infinity()}, Overflow}; + } else if (exponent_ > crazy) { // overflow to +/-HUGE() or +/-Inf + if (RoundOverflowToHuge(rounding_, isNegative_)) { + return {Real{HUGE()}}; + } else { + return {Real{Infinity()}, Overflow}; + } } // Apply any negative decimal exponent by multiplication // by a power of two, adjusting the binary exponent to compensate. 
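Reduced to a standalone predicate, the policy above looks like the following sketch (the Rounding enumerators are stand-ins for the runtime's FortranRounding values; the asserts restate the new unit-test expectations):

  // Stand-in enumerators for the runtime's rounding modes (RN/RU/RD/RZ/RC).
  enum Rounding { RoundNearest, RoundUp, RoundDown, RoundToZero, RoundCompatible };

  // Same decision as RoundOverflowToHuge above: an overflowing input rounds
  // to +/-HUGE() when rounding toward zero or away from the overflow
  // direction; otherwise it becomes +/-Inf and the overflow flag is raised.
  constexpr bool roundsToHuge(Rounding r, bool isNegative) {
    return r == RoundToZero || (!isNegative && r == RoundDown) ||
           (isNegative && r == RoundUp);
  }

  static_assert(roundsToHuge(RoundToZero, false), "' 2.e308' under RZ -> +HUGE()");
  static_assert(!roundsToHuge(RoundUp, false), "' 2.e308' under RU -> +Inf");
  static_assert(roundsToHuge(RoundUp, true), "'-2.e308' under RU -> -HUGE()");
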
diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp index 6e26e523c5ca7..2b80974906777 100644 --- a/flang/runtime/edit-input.cpp +++ b/flang/runtime/edit-input.cpp @@ -650,15 +650,23 @@ decimal::ConversionToBinaryResult<binaryPrecision> ConvertHexadecimal( expo = 0; // subnormal flags |= decimal::Underflow; } else if (expo >= RealType::maxExponent) { - expo = RealType::maxExponent; // +/-Inf - fraction = 0; + if (rounding == decimal::RoundToZero || + (rounding == decimal::RoundDown && !isNegative) || + (rounding == decimal::RoundUp && isNegative)) { + expo = RealType::maxExponent - 1; // +/-HUGE() + fraction = significandMask; + } else { + expo = RealType::maxExponent; // +/-Inf + fraction = 0; + flags |= decimal::Overflow; + } } else { fraction &= significandMask; // remove explicit normalization unless x87 } return decimal::ConversionToBinaryResult<binaryPrecision>{ RealType{static_cast<RawType>(signBit | static_cast<RawType>(expo) << RealType::significandBits | fraction)}, - static_cast<enum decimal::ConversionResultFlags>(flags)}; + static_cast<decimal::ConversionResultFlags>(flags)}; } template <int KIND> diff --git a/flang/unittests/Runtime/NumericalFormatTest.cpp b/flang/unittests/Runtime/NumericalFormatTest.cpp index 69637d8c6cb4c..bf954a84444ac 100644 --- a/flang/unittests/Runtime/NumericalFormatTest.cpp +++ b/flang/unittests/Runtime/NumericalFormatTest.cpp @@ -856,49 +856,66 @@ TEST(IOApiTests, FormatIntegerValues) { // Ensure double input values correctly map to raw uint64 values TEST(IOApiTests, EditDoubleInputValues) { - using TestCaseTy = std::tuple<const char *, const char *, std::uint64_t>; + using TestCaseTy = std::tuple<const char *, const char *, std::uint64_t, int>; + int ovf{IostatRealInputOverflow}; static const std::vector<TestCaseTy> testCases{ - {"(F18.0)", " 0", 0x0}, - {"(F18.0)", " ", 0x0}, - {"(F18.0)", " -0", 0x8000000000000000}, - {"(F18.0)", " 01", 0x3ff0000000000000}, - {"(F18.0)", " 1", 0x3ff0000000000000}, - {"(F18.0)", " 125.", 0x405f400000000000}, - {"(F18.0)", " 12.5", 0x4029000000000000}, - {"(F18.0)", " 1.25", 0x3ff4000000000000}, - {"(F18.0)", " 01.25", 0x3ff4000000000000}, - {"(F18.0)", " .125", 0x3fc0000000000000}, - {"(F18.0)", " 0.125", 0x3fc0000000000000}, - {"(F18.0)", " .0625", 0x3fb0000000000000}, - {"(F18.0)", " 0.0625", 0x3fb0000000000000}, - {"(F18.0)", " 125", 0x405f400000000000}, - {"(F18.1)", " 125", 0x4029000000000000}, - {"(F18.2)", " 125", 0x3ff4000000000000}, - {"(F18.3)", " 125", 0x3fc0000000000000}, - {"(-1P,F18.0)", " 125", 0x4093880000000000}, // 1250 - {"(1P,F18.0)", " 125", 0x4029000000000000}, // 12.5 - {"(BZ,F18.0)", " 125 ", 0x4093880000000000}, // 1250 - {"(BZ,F18.0)", " 125 . e +1 ", 0x42a6bcc41e900000}, // 1.25e13 - {"(BZ,F18.0)", " . ", 0x0}, - {"(BZ,F18.0)", " . e +1 ", 0x0}, - {"(DC,F18.0)", " 12,5", 0x4029000000000000}, - {"(EX22.0)", "0X0P0 ", 0x0}, // +0. - {"(EX22.0)", "-0X0P0 ", 0x8000000000000000}, // -0. - {"(EX22.0)", "0X.8P1 ", 0x3ff0000000000000}, // 1.0 - {"(EX22.0)", "0X8.P-3 ", 0x3ff0000000000000}, // 1.0 - {"(EX22.0)", "0X.1P4 ", 0x3ff0000000000000}, // 1.0 - {"(EX22.0)", "0X10.P-4 ", 0x3ff0000000000000}, // 1.0 - {"(EX22.0)", "0X8.00P-3 ", 0x3ff0000000000000}, // 1.0 - {"(EX22.0)", "0X80.0P-6 ", 0x4000000000000000}, // 2.0 - {"(EX22.0)", "0XC.CCCCCCCCCCCDP-7 ", 0x3fb999999999999a}, // 0.1 - {"(EX22.0)", "0X.8P-1021 ", 0x0010000000000000}, // min normal - {"(EX22.0)", "0X.8P-1022 ", 0x0008000000000000}, // subnormal - {"(EX22.0)", "0X.8P-1073 ", 0x0000000000000001}, // min subn. 
- {"(EX22.0)", "0X.FFFFFFFFFFFFF8P1024", 0x7fefffffffffffff}, // max finite - {"(EX22.0)", "0X.8P1025 ", 0x7ff0000000000000}, // +Inf - {"(EX22.0)", "-0X.8P1025 ", 0xfff0000000000000}, // -Inf + {"(F18.0)", " 0", 0x0, 0}, + {"(F18.0)", " ", 0x0, 0}, + {"(F18.0)", " -0", 0x8000000000000000, 0}, + {"(F18.0)", " 01", 0x3ff0000000000000, 0}, + {"(F18.0)", " 1", 0x3ff0000000000000, 0}, + {"(F18.0)", " 125.", 0x405f400000000000, 0}, + {"(F18.0)", " 12.5", 0x4029000000000000, 0}, + {"(F18.0)", " 1.25", 0x3ff4000000000000, 0}, + {"(F18.0)", " 01.25", 0x3ff4000000000000, 0}, + {"(F18.0)", " .125", 0x3fc0000000000000, 0}, + {"(F18.0)", " 0.125", 0x3fc0000000000000, 0}, + {"(F18.0)", " .0625", 0x3fb0000000000000, 0}, + {"(F18.0)", " 0.0625", 0x3fb0000000000000, 0}, + {"(F18.0)", " 125", 0x405f400000000000, 0}, + {"(F18.1)", " 125", 0x4029000000000000, 0}, + {"(F18.2)", " 125", 0x3ff4000000000000, 0}, + {"(F18.3)", " 125", 0x3fc0000000000000, 0}, + {"(-1P,F18.0)", " 125", 0x4093880000000000, 0}, // 1250 + {"(1P,F18.0)", " 125", 0x4029000000000000, 0}, // 12.5 + {"(BZ,F18.0)", " 125 ", 0x4093880000000000, 0}, // 1250 + {"(BZ,F18.0)", " 125 . e +1 ", 0x42a6bcc41e900000, 0}, // 1.25e13 + {"(BZ,F18.0)", " . ", 0x0, 0}, + {"(BZ,F18.0)", " . e +1 ", 0x0, 0}, + {"(DC,F18.0)", " 12,5", 0x4029000000000000, 0}, + {"(EX22.0)", "0X0P0 ", 0x0, 0}, // +0. + {"(EX22.0)", "-0X0P0 ", 0x8000000000000000, 0}, // -0. + {"(EX22.0)", "0X.8P1 ", 0x3ff0000000000000, 0}, // 1.0 + {"(EX22.0)", "0X8.P-3 ", 0x3ff0000000000000, 0}, // 1.0 + {"(EX22.0)", "0X.1P4 ", 0x3ff0000000000000, 0}, // 1.0 + {"(EX22.0)", "0X10.P-4 ", 0x3ff0000000000000, 0}, // 1.0 + {"(EX22.0)", "0X8.00P-3 ", 0x3ff0000000000000, 0}, // 1.0 + {"(EX22.0)", "0X80.0P-6 ", 0x4000000000000000, 0}, // 2.0 + {"(EX22.0)", "0XC.CCCCCCCCCCCDP-7 ", 0x3fb999999999999a, 0}, // 0.1 + {"(EX22.0)", "0X.8P-1021 ", 0x0010000000000000, + 0}, // min normal + {"(EX22.0)", "0X.8P-1022 ", 0x0008000000000000, + 0}, // subnormal + {"(EX22.0)", "0X.8P-1073 ", 0x0000000000000001, + 0}, // min subn. 
+ {"(EX22.0)", "0X.FFFFFFFFFFFFF8P1024", 0x7fefffffffffffff, + 0}, // max finite + {"(EX22.0)", "0X.8P1025 ", 0x7ff0000000000000, ovf}, // +Inf + {"(EX22.0)", "-0X.8P1025 ", 0xfff0000000000000, ovf}, // -Inf + {"(RZ,F7.0)", " 2.e308", 0x7fefffffffffffff, 0}, // +HUGE() + {"(RD,F7.0)", " 2.e308", 0x7fefffffffffffff, 0}, // +HUGE() + {"(RU,F7.0)", " 2.e308", 0x7ff0000000000000, ovf}, // +Inf + {"(RZ,F7.0)", "-2.e308", 0xffefffffffffffff, 0}, // -HUGE() + {"(RD,F7.0)", "-2.e308", 0xfff0000000000000, ovf}, // -Inf + {"(RU,F7.0)", "-2.e308", 0xffefffffffffffff, 0}, // -HUGE() + {"(RZ,F7.0)", " 1.e999", 0x7fefffffffffffff, 0}, // +HUGE() + {"(RD,F7.0)", " 1.e999", 0x7fefffffffffffff, 0}, // +HUGE() + {"(RU,F7.0)", " 1.e999", 0x7ff0000000000000, ovf}, // +Inf + {"(RZ,F7.0)", "-1.e999", 0xffefffffffffffff, 0}, // -HUGE() + {"(RD,F7.0)", "-1.e999", 0xfff0000000000000, ovf}, // -Inf + {"(RU,F7.0)", "-1.e999", 0xffefffffffffffff, 0}, // -HUGE() }; - for (auto const &[format, data, want] : testCases) { + for (auto const &[format, data, want, iostat] : testCases) { auto cookie{IONAME(BeginInternalFormattedInput)( data, std::strlen(data), format, std::strlen(format))}; union { @@ -915,12 +932,14 @@ TEST(IOApiTests, EditDoubleInputValues) { char iomsg[bufferSize]; std::memset(iomsg, '\0', bufferSize - 1); - // Ensure no errors were encountered reading input buffer into union value + // Ensure no unexpected errors were encountered reading input buffer into + // union value IONAME(GetIoMsg)(cookie, iomsg, bufferSize - 1); auto status{IONAME(EndIoStatement)(cookie)}; - ASSERT_EQ(status, 0) << '\'' << format << "' failed reading '" << data - << "', status " << static_cast<int>(status) - << " iomsg '" << iomsg << "'"; + ASSERT_EQ(status, iostat) + << '\'' << format << "' failed reading '" << data << "', status " + << static_cast<int>(status) << " != expected " << iostat << " iomsg '" + << iomsg << "'"; // Ensure raw uint64 value matches expected conversion from double ASSERT_EQ(u.raw, want) << '\'' << format << "' failed reading '" << data From f45723cded56b1e66b572fba0b71d117db6caa2e Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 15:57:35 -0800 Subject: [PATCH 311/342] [flang][runtime] Fix RU/RD results when rounding to least nonzero (#75878) When rounding what otherwise would have been a result that underflowed to zero up (RU) or down (RD) to the least magnitude nonzero subnormal number, ensure that the original exponent value doesn't perturb the result. 
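For reference, the target values are the least-magnitude nonzero subnormals; a standalone C++20 sketch, independent of the runtime, of the raw bit patterns the new tests check:

  #include <bit>
  #include <cmath>
  #include <cstdint>

  // RU of a positive input that underflows (e.g. 1.0E-325) must yield the
  // least positive subnormal, raw 0x0000000000000001; RD of its negation
  // yields raw 0x8000000000000001, with no stray exponent bits left behind.
  inline std::uint64_t rawLeastSubnormal(bool negative) {
    double least = std::nextafter(negative ? -0.0 : 0.0,
                                  negative ? -1.0 : 1.0);
    return std::bit_cast<std::uint64_t>(least);
  }
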
--- flang/lib/Decimal/decimal-to-binary.cpp | 1 + flang/unittests/Runtime/NumericalFormatTest.cpp | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/flang/lib/Decimal/decimal-to-binary.cpp b/flang/lib/Decimal/decimal-to-binary.cpp index 05a5f038353e0..d38af0f9b8005 100644 --- a/flang/lib/Decimal/decimal-to-binary.cpp +++ b/flang/lib/Decimal/decimal-to-binary.cpp @@ -270,6 +270,7 @@ ConversionToBinaryResult<PREC> IntermediateFloat<PREC>::ToBinary( if ((!isNegative && rounding == RoundUp) || (isNegative && rounding == RoundDown)) { // round to least nonzero value + expo = 0; } else { // round to zero if (guard != 0) { flags |= Underflow; diff --git a/flang/unittests/Runtime/NumericalFormatTest.cpp b/flang/unittests/Runtime/NumericalFormatTest.cpp index bf954a84444ac..9dd2771fe4a75 100644 --- a/flang/unittests/Runtime/NumericalFormatTest.cpp +++ b/flang/unittests/Runtime/NumericalFormatTest.cpp @@ -914,6 +914,10 @@ TEST(IOApiTests, EditDoubleInputValues) { {"(RZ,F7.0)", "-1.e999", 0xffefffffffffffff, 0}, // -HUGE() {"(RD,F7.0)", "-1.e999", 0xfff0000000000000, ovf}, // -Inf {"(RU,F7.0)", "-1.e999", 0xffefffffffffffff, 0}, // -HUGE() + {"(E9.1)", " 1.0E-325", 0x0, 0}, + {"(RU,E9.1)", " 1.0E-325", 0x1, 0}, + {"(E9.1)", "-1.0E-325", 0x0, 0}, + {"(RD,E9.1)", "-1.0E-325", 0x8000000000000001, 0}, }; for (auto const &[format, data, want, iostat] : testCases) { auto cookie{IONAME(BeginInternalFormattedInput)( From c86fe3ee0b92934f5d18394d9a0cdc1d3f0eef64 Mon Sep 17 00:00:00 2001 From: Jie Fu <jiefu@tencent.com> Date: Wed, 27 Dec 2023 08:00:58 +0800 Subject: [PATCH 312/342] [mlir][Quasipolynomials] Fix -Wunused-variable in QuasiPolynomial.cpp (NFC) llvm-project/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp:29:39: error: unused variable 'aff' [-Werror,-Wunused-variable] for (const SmallVector<Fraction> &aff : term) { ^ 1 error generated. --- mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp index 902e3ced472f8..3ae4fb726215f 100644 --- a/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp +++ b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp @@ -20,6 +20,7 @@ QuasiPolynomial::QuasiPolynomial( : PresburgerSpace(/*numDomain=*/numVars, /*numRange=*/1, /*numSymbols=*/0, /*numLocals=*/0), coefficients(coeffs), affine(aff) { +#ifndef NDEBUG // For each term which involves at least one affine function, for (const std::vector<SmallVector<Fraction>> &term : affine) { if (term.size() == 0) @@ -32,6 +33,7 @@ QuasiPolynomial::QuasiPolynomial( "symbols!"); } } +#endif // NDEBUG } QuasiPolynomial QuasiPolynomial::operator+(const QuasiPolynomial &x) const { From 5a402c56226e9b50bffdedd19d2acb8b61b408a3 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Tue, 26 Dec 2023 16:03:03 -0800 Subject: [PATCH 313/342] [flang] USE-associated explicit INTRINSIC names (#76199) The compiler doesn't USE-associate names of intrinsic procedures from modules (in the absence of ONLY:), so that the associating scope doesn't get populated with names of intrinsics that were used only in declarations (e.g., SELECTED_REAL_KIND). A recent bug report (below) shows that we should modify that policy in the case of names that appear in explicit INTRINSIC attribute statements. The behaviors of other Fortran compilers are not consistent and the requirements of the standard are not clear; this fix follows the precedent set by gfortran and nvfortran. 
Fixes https://github.com/llvm/llvm-project/issues/72084. --- flang/docs/Extensions.md | 4 ++ flang/lib/Semantics/resolve-names.cpp | 59 +++++++++++++++++---------- flang/module/iso_fortran_env.f90 | 1 + flang/test/Semantics/contiguous01.f90 | 2 +- flang/test/Semantics/intrinsics02.f90 | 38 +++++++++++++++++ 5 files changed, 81 insertions(+), 23 deletions(-) create mode 100644 flang/test/Semantics/intrinsics02.f90 diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 03d4310466485..6c6588025a392 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -641,6 +641,10 @@ module m end ``` +* When an intrinsic procedure appears in the specification part of a module + only in function references, but not an explicit `INTRINSIC` statement, + its name is not brought into other scopes by a `USE` statement. + ## De Facto Standard Features * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index e1cd34ddf65b6..f5f7b99aba255 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -2904,7 +2904,7 @@ void ModuleVisitor::Post(const parser::UseStmt &x) { } for (const auto &[name, symbol] : *useModuleScope_) { if (symbol->attrs().test(Attr::PUBLIC) && !IsUseRenamed(symbol->name()) && - (!symbol->attrs().test(Attr::INTRINSIC) || + (!symbol->implicitAttrs().test(Attr::INTRINSIC) || symbol->has<UseDetails>()) && !symbol->has<MiscDetails>() && useNames.count(name) == 0) { SourceName location{x.moduleName.source}; @@ -2998,7 +2998,7 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, details->add_occurrence(location, *useModuleScope_); return; } - + const Symbol &useUltimate{useSymbol.GetUltimate()}; if (localSymbol.has<UnknownDetails>()) { localSymbol.set_details(UseDetails{localName, useSymbol}); localSymbol.attrs() = @@ -3010,7 +3010,6 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, } Symbol &localUltimate{localSymbol.GetUltimate()}; - const Symbol &useUltimate{useSymbol.GetUltimate()}; if (&localUltimate == &useUltimate) { // use-associating the same symbol again -- ok return; @@ -3044,13 +3043,19 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, checkAmbiguousDerivedType(&useUltimate, localGeneric->derivedType()); } else if (&useUltimate == &BypassGeneric(localUltimate).GetUltimate()) { return; // nothing to do; used subprogram is local's specific + } else if (useUltimate.attrs().test(Attr::INTRINSIC) && + useUltimate.name() == localSymbol.name()) { + return; // local generic can extend intrinsic } } else if (useGeneric) { if (localUltimate.has<DerivedTypeDetails>()) { combine = checkAmbiguousDerivedType(&localUltimate, useGeneric->derivedType()); - } else if (&localUltimate == &BypassGeneric(useUltimate).GetUltimate()) { - // Local is the specific of the used generic; replace it. + } else if (&localUltimate == &BypassGeneric(useUltimate).GetUltimate() || + (localSymbol.attrs().test(Attr::INTRINSIC) && + localUltimate.name() == useUltimate.name())) { + // Local is the specific of the used generic or an intrinsic with the + // same name; replace it. 
EraseSymbol(localSymbol); Symbol &newSymbol{MakeSymbol(localName, useUltimate.attrs() & ~Attrs{Attr::PUBLIC, Attr::PRIVATE}, @@ -3058,23 +3063,22 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, newSymbol.flags() = useSymbol.flags(); return; } + } else if (localUltimate.name() != useUltimate.name()) { + // not the same procedure + } else if (localUltimate.attrs().test(Attr::INTRINSIC) && + useUltimate.attrs().test(Attr::INTRINSIC)) { + return; } else { auto localClass{ClassifyProcedure(localUltimate)}; auto useClass{ClassifyProcedure(useUltimate)}; - if (localClass == useClass && - (localClass == ProcedureDefinitionClass::Intrinsic || - localClass == ProcedureDefinitionClass::External) && - localUltimate.name() == useUltimate.name()) { + if (localClass == ProcedureDefinitionClass::External && + useClass == ProcedureDefinitionClass::External) { auto localChars{evaluate::characteristics::Procedure::Characterize( localUltimate, GetFoldingContext())}; auto useChars{evaluate::characteristics::Procedure::Characterize( useUltimate, GetFoldingContext())}; - if (localChars && useChars) { - if (*localChars == *useChars) { - // Same intrinsic or external procedure defined identically in two - // modules - return; - } + if (localChars && useChars && *localChars == *useChars) { + return; // same procedure defined identically in two modules } } } @@ -4794,9 +4798,15 @@ Symbol &DeclarationVisitor::HandleAttributeStmt( } } } else if (symbol && symbol->has<UseDetails>()) { - Say(currStmtSource().value(), - "Cannot change %s attribute on use-associated '%s'"_err_en_US, - EnumToString(attr), name.source); + if (symbol->GetUltimate().attrs().test(attr)) { + Say(currStmtSource().value(), + "Use-associated '%s' already has '%s' attribute"_warn_en_US, + name.source, EnumToString(attr)); + } else { + Say(currStmtSource().value(), + "Cannot change %s attribute on use-associated '%s'"_err_en_US, + EnumToString(attr), name.source); + } return *symbol; } if (!symbol) { @@ -6244,8 +6254,8 @@ bool DeclarationVisitor::HandleUnrestrictedSpecificIntrinsicFunction( // recreated for it later on demand, but capturing its result type here // will make GetType() return a correct result without having to // probe the intrinsics table again. 
- Symbol &symbol{ - MakeSymbol(InclusiveScope(), name.source, Attrs{Attr::INTRINSIC})}; + Symbol &symbol{MakeSymbol(InclusiveScope(), name.source, Attrs{})}; + SetImplicitAttr(symbol, Attr::INTRINSIC); CHECK(interface->functionResult.has_value()); evaluate::DynamicType dyType{ DEREF(interface->functionResult->GetTypeAndShape()).type()}; @@ -7708,8 +7718,8 @@ void ResolveNamesVisitor::HandleProcedureName( auto *symbol{FindSymbol(NonDerivedTypeScope(), name)}; if (!symbol) { if (IsIntrinsic(name.source, flag)) { - symbol = - &MakeSymbol(InclusiveScope(), name.source, Attrs{Attr::INTRINSIC}); + symbol = &MakeSymbol(InclusiveScope(), name.source, Attrs{}); + SetImplicitAttr(*symbol, Attr::INTRINSIC); } else if (const auto ppcBuiltinScope = currScope().context().GetPPCBuiltinsScope()) { // Check if it is a builtin from the predefined module @@ -8047,6 +8057,11 @@ void ResolveNamesVisitor::CreateGeneric(const parser::GenericSpec &x) { } else if (ultimate.has<SubprogramDetails>() || ultimate.has<SubprogramNameDetails>()) { genericDetails.set_specific(*existing); + } else if (ultimate.has<ProcEntityDetails>()) { + if (existing->name() != symbolName || + !ultimate.attrs().test(Attr::INTRINSIC)) { + genericDetails.set_specific(*existing); + } } else if (ultimate.has<DerivedTypeDetails>()) { genericDetails.set_derivedType(*existing); } else if (&existing->owner() == &currScope()) { diff --git a/flang/module/iso_fortran_env.f90 b/flang/module/iso_fortran_env.f90 index f1d540bc8e451..61d8a07e61133 100644 --- a/flang/module/iso_fortran_env.f90 +++ b/flang/module/iso_fortran_env.f90 @@ -23,6 +23,7 @@ module iso_fortran_env compiler_version => __builtin_compiler_version implicit none + private count ! TODO: Use PACK([x],test) in place of the array constructor idiom ! [(x, integer::j=1,COUNT([test]))] below once PACK() can be folded. diff --git a/flang/test/Semantics/contiguous01.f90 b/flang/test/Semantics/contiguous01.f90 index 1d3600aef6c55..0f086624a20ae 100644 --- a/flang/test/Semantics/contiguous01.f90 +++ b/flang/test/Semantics/contiguous01.f90 @@ -5,7 +5,7 @@ module m0 end module m use m0 - !ERROR: Cannot change CONTIGUOUS attribute on use-associated 'p1' + !WARNING: Use-associated 'p1' already has 'CONTIGUOUS' attribute contiguous p1 !ERROR: Cannot change CONTIGUOUS attribute on use-associated 'p2' contiguous p2 diff --git a/flang/test/Semantics/intrinsics02.f90 b/flang/test/Semantics/intrinsics02.f90 new file mode 100644 index 0000000000000..0b1f7c13a1564 --- /dev/null +++ b/flang/test/Semantics/intrinsics02.f90 @@ -0,0 +1,38 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +module explicit + intrinsic cos +end +subroutine testExplicit + use explicit + !ERROR: 'cos' is use-associated from module 'explicit' and cannot be re-declared + real :: cos = 2. +end +subroutine extendsUsedIntrinsic + use explicit + interface cos + pure real function mycos(x) + real, intent(in) :: x + end + end interface +end +subroutine sameIntrinsic1 + use explicit + !WARNING: Use-associated 'cos' already has 'INTRINSIC' attribute + intrinsic cos + real :: one = cos(0.) +end +module renamer + use explicit, renamedCos => cos +end +subroutine sameIntrinsic2 + use explicit + use renamer, cos => renamedCos + real :: one = cos(0.) +end +module implicit + real :: one = cos(0.) +end +subroutine testImplicit + use implicit + real :: cos = 2. 
+end From b3ef8dce9811b2725639b0d4fac3f85c7e112817 Mon Sep 17 00:00:00 2001 From: Jinyang He <hejinyang@loongson.cn> Date: Wed, 27 Dec 2023 08:51:48 +0800 Subject: [PATCH 314/342] [LoongArch] Emit R_LARCH_RELAX when expanding some LoadAddress (#72961) Emit relax relocs when expand non-large la.pcrel and non-large la.got on llvm-mc stage, which like what does on GAS. 1, la.pcrel -> PCALA_HI20 + RELAX + PCALA_LO12 + RELAX 2, la.got -> GOT_PC_HI20 + RELAX + GOT_PC_LO12 + RELAX --- .../AsmParser/LoongArchAsmParser.cpp | 12 +-- .../MCTargetDesc/LoongArchMCCodeEmitter.cpp | 13 +++ .../MCTargetDesc/LoongArchMCExpr.cpp | 7 +- .../LoongArch/MCTargetDesc/LoongArchMCExpr.h | 8 +- llvm/test/MC/LoongArch/Macros/macros-la.s | 84 ++++++++++++++++--- llvm/test/MC/LoongArch/Misc/subsection.s | 2 +- .../MC/LoongArch/Relocations/relax-addsub.s | 16 +++- 7 files changed, 115 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp index 276374afee380..66a37fce5dda1 100644 --- a/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp +++ b/llvm/lib/Target/LoongArch/AsmParser/LoongArchAsmParser.cpp @@ -85,7 +85,7 @@ class LoongArchAsmParser : public MCTargetAsmParser { // "emitLoadAddress*" functions. void emitLAInstSeq(MCRegister DestReg, MCRegister TmpReg, const MCExpr *Symbol, SmallVectorImpl<Inst> &Insts, - SMLoc IDLoc, MCStreamer &Out); + SMLoc IDLoc, MCStreamer &Out, bool RelaxHint = false); // Helper to emit pseudo instruction "la.abs $rd, sym". void emitLoadAddressAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out); @@ -748,12 +748,14 @@ bool LoongArchAsmParser::ParseInstruction(ParseInstructionInfo &Info, void LoongArchAsmParser::emitLAInstSeq(MCRegister DestReg, MCRegister TmpReg, const MCExpr *Symbol, SmallVectorImpl<Inst> &Insts, - SMLoc IDLoc, MCStreamer &Out) { + SMLoc IDLoc, MCStreamer &Out, + bool RelaxHint) { MCContext &Ctx = getContext(); for (LoongArchAsmParser::Inst &Inst : Insts) { unsigned Opc = Inst.Opc; LoongArchMCExpr::VariantKind VK = Inst.VK; - const LoongArchMCExpr *LE = LoongArchMCExpr::create(Symbol, VK, Ctx); + const LoongArchMCExpr *LE = + LoongArchMCExpr::create(Symbol, VK, Ctx, RelaxHint); switch (Opc) { default: llvm_unreachable("unexpected opcode"); @@ -854,7 +856,7 @@ void LoongArchAsmParser::emitLoadAddressPcrel(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(ADDI, LoongArchMCExpr::VK_LoongArch_PCALA_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); } void LoongArchAsmParser::emitLoadAddressPcrelLarge(MCInst &Inst, SMLoc IDLoc, @@ -900,7 +902,7 @@ void LoongArchAsmParser::emitLoadAddressGot(MCInst &Inst, SMLoc IDLoc, Insts.push_back( LoongArchAsmParser::Inst(LD, LoongArchMCExpr::VK_LoongArch_GOT_PC_LO12)); - emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out); + emitLAInstSeq(DestReg, DestReg, Symbol, Insts, IDLoc, Out, true); } void LoongArchAsmParser::emitLoadAddressGotLarge(MCInst &Inst, SMLoc IDLoc, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp index 45169becca37b..d2ea062dc09a7 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCCodeEmitter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" 
+#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" @@ -120,12 +121,15 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { assert(MO.isExpr() && "getExprOpValue expects only expressions"); + bool RelaxCandidate = false; + bool EnableRelax = STI.hasFeature(LoongArch::FeatureRelax); const MCExpr *Expr = MO.getExpr(); MCExpr::ExprKind Kind = Expr->getKind(); LoongArch::Fixups FixupKind = LoongArch::fixup_loongarch_invalid; if (Kind == MCExpr::Target) { const LoongArchMCExpr *LAExpr = cast<LoongArchMCExpr>(Expr); + RelaxCandidate = LAExpr->getRelaxHint(); switch (LAExpr->getKind()) { case LoongArchMCExpr::VK_LoongArch_None: case LoongArchMCExpr::VK_LoongArch_Invalid: @@ -270,6 +274,15 @@ LoongArchMCCodeEmitter::getExprOpValue(const MCInst &MI, const MCOperand &MO, Fixups.push_back( MCFixup::create(0, Expr, MCFixupKind(FixupKind), MI.getLoc())); + + // Emit an R_LARCH_RELAX if linker relaxation is enabled and LAExpr has relax + // hint. + if (EnableRelax && RelaxCandidate) { + const MCConstantExpr *Dummy = MCConstantExpr::create(0, Ctx); + Fixups.push_back(MCFixup::create( + 0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_relax), MI.getLoc())); + } + return 0; } diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp index 993111552a314..82c992b1cc8c4 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.cpp @@ -25,9 +25,10 @@ using namespace llvm; #define DEBUG_TYPE "loongarch-mcexpr" -const LoongArchMCExpr * -LoongArchMCExpr::create(const MCExpr *Expr, VariantKind Kind, MCContext &Ctx) { - return new (Ctx) LoongArchMCExpr(Expr, Kind); +const LoongArchMCExpr *LoongArchMCExpr::create(const MCExpr *Expr, + VariantKind Kind, MCContext &Ctx, + bool Hint) { + return new (Ctx) LoongArchMCExpr(Expr, Kind, Hint); } void LoongArchMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h index 0945cf82db865..93251f8241033 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCExpr.h @@ -67,16 +67,18 @@ class LoongArchMCExpr : public MCTargetExpr { private: const MCExpr *Expr; const VariantKind Kind; + const bool RelaxHint; - explicit LoongArchMCExpr(const MCExpr *Expr, VariantKind Kind) - : Expr(Expr), Kind(Kind) {} + explicit LoongArchMCExpr(const MCExpr *Expr, VariantKind Kind, bool Hint) + : Expr(Expr), Kind(Kind), RelaxHint(Hint) {} public: static const LoongArchMCExpr *create(const MCExpr *Expr, VariantKind Kind, - MCContext &Ctx); + MCContext &Ctx, bool Hint = false); VariantKind getKind() const { return Kind; } const MCExpr *getSubExpr() const { return Expr; } + bool getRelaxHint() const { return RelaxHint; } void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, diff --git a/llvm/test/MC/LoongArch/Macros/macros-la.s b/llvm/test/MC/LoongArch/Macros/macros-la.s index 924e4326b8e5d..1a1d12d7d7dfd 100644 --- a/llvm/test/MC/LoongArch/Macros/macros-la.s +++ b/llvm/test/MC/LoongArch/Macros/macros-la.s @@ -1,66 +1,128 @@ # RUN: llvm-mc --triple=loongarch64 %s | FileCheck %s +# RUN: llvm-mc --filetype=obj 
--triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.relax +# RUN: llvm-readobj -r %t.relax | FileCheck %s --check-prefixes=RELOC,RELAX + +# RELOC: Relocations [ +# RELOC-NEXT: Section ({{.*}}) .rela.text { la.abs $a0, sym_abs # CHECK: lu12i.w $a0, %abs_hi20(sym_abs) # CHECK-NEXT: ori $a0, $a0, %abs_lo12(sym_abs) # CHECK-NEXT: lu32i.d $a0, %abs64_lo20(sym_abs) # CHECK-NEXT: lu52i.d $a0, $a0, %abs64_hi12(sym_abs) +# CHECK-EMPTY: +# RELOC-NEXT: R_LARCH_ABS_HI20 sym_abs 0x0 +# RELOC-NEXT: R_LARCH_ABS_LO12 sym_abs 0x0 +# RELOC-NEXT: R_LARCH_ABS64_LO20 sym_abs 0x0 +# RELOC-NEXT: R_LARCH_ABS64_HI12 sym_abs 0x0 la.pcrel $a0, sym_pcrel -# CHECK: pcalau12i $a0, %pc_hi20(sym_pcrel) +# CHECK-NEXT: pcalau12i $a0, %pc_hi20(sym_pcrel) # CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(sym_pcrel) +# CHECK-EMPTY: +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_pcrel 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.pcrel $a0, $a1, sym_pcrel_large -# CHECK: pcalau12i $a0, %pc_hi20(sym_pcrel_large) +# CHECK-NEXT: pcalau12i $a0, %pc_hi20(sym_pcrel_large) # CHECK-NEXT: addi.d $a1, $zero, %pc_lo12(sym_pcrel_large) # CHECK-NEXT: lu32i.d $a1, %pc64_lo20(sym_pcrel_large) # CHECK-NEXT: lu52i.d $a1, $a1, %pc64_hi12(sym_pcrel_large) # CHECK-NEXT: add.d $a0, $a0, $a1 +# CHECK-EMPTY: +# RELOC-NEXT: R_LARCH_PCALA_HI20 sym_pcrel_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA_LO12 sym_pcrel_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_LO20 sym_pcrel_large 0x0 +# RELOC-NEXT: R_LARCH_PCALA64_HI12 sym_pcrel_large 0x0 la.got $a0, sym_got -# CHECK: pcalau12i $a0, %got_pc_hi20(sym_got) +# CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(sym_got) # CHECK-NEXT: ld.d $a0, $a0, %got_pc_lo12(sym_got) +# CHECK-EMPTY: +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_got 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_got 0x0 +# RELAX-NEXT: R_LARCH_RELAX - 0x0 la.got $a0, $a1, sym_got_large -# CHECK: pcalau12i $a0, %got_pc_hi20(sym_got_large) +# CHECK-NEXT: pcalau12i $a0, %got_pc_hi20(sym_got_large) # CHECK-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_got_large) # CHECK-NEXT: lu32i.d $a1, %got64_pc_lo20(sym_got_large) # CHECK-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(sym_got_large) # CHECK-NEXT: ldx.d $a0, $a0, $a1 +# CHECK-EMPTY: +# RELOC-NEXT: R_LARCH_GOT_PC_HI20 sym_got_large 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_got_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_got_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_got_large 0x0 la.tls.le $a0, sym_le -# CHECK: lu12i.w $a0, %le_hi20(sym_le) +# CHECK-NEXT: lu12i.w $a0, %le_hi20(sym_le) # CHECK-NEXT: ori $a0, $a0, %le_lo12(sym_le) +# CHECK-EMPTY: +# RELOC-NEXT: R_LARCH_TLS_LE_HI20 sym_le 0x0 +# RELOC-NEXT: R_LARCH_TLS_LE_LO12 sym_le 0x0 la.tls.ie $a0, sym_ie -# CHECK: pcalau12i $a0, %ie_pc_hi20(sym_ie) +# CHECK-NEXT: pcalau12i $a0, %ie_pc_hi20(sym_ie) # CHECK-NEXT: ld.d $a0, $a0, %ie_pc_lo12(sym_ie) +# CHECK-EMPTY: +# RELOC-NEXT: R_LARCH_TLS_IE_PC_HI20 sym_ie 0x0 +# RELOC-NEXT: R_LARCH_TLS_IE_PC_LO12 sym_ie 0x0 la.tls.ie $a0, $a1, sym_ie_large -# CHECK: pcalau12i $a0, %ie_pc_hi20(sym_ie_large) +# CHECK-NEXT: pcalau12i $a0, %ie_pc_hi20(sym_ie_large) # CHECK-NEXT: addi.d $a1, $zero, %ie_pc_lo12(sym_ie_large) # CHECK-NEXT: lu32i.d $a1, %ie64_pc_lo20(sym_ie_large) # CHECK-NEXT: lu52i.d $a1, $a1, %ie64_pc_hi12(sym_ie_large) # CHECK-NEXT: ldx.d $a0, $a0, $a1 +# CHECK-EMPTY: +# RELOC-NEXT: 
R_LARCH_TLS_IE_PC_HI20 sym_ie_large 0x0 +# RELOC-NEXT: R_LARCH_TLS_IE_PC_LO12 sym_ie_large 0x0 +# RELOC-NEXT: R_LARCH_TLS_IE64_PC_LO20 sym_ie_large 0x0 +# RELOC-NEXT: R_LARCH_TLS_IE64_PC_HI12 sym_ie_large 0x0 la.tls.ld $a0, sym_ld -# CHECK: pcalau12i $a0, %ld_pc_hi20(sym_ld) +# CHECK-NEXT: pcalau12i $a0, %ld_pc_hi20(sym_ld) # CHECK-NEXT: addi.d $a0, $a0, %got_pc_lo12(sym_ld) +# CHECK-EMPTY: +# RELOC-NEXT: R_LARCH_TLS_LD_PC_HI20 sym_ld 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_ld 0x0 la.tls.ld $a0, $a1, sym_ld_large -# CHECK: pcalau12i $a0, %ld_pc_hi20(sym_ld_large) +# CHECK-NEXT: pcalau12i $a0, %ld_pc_hi20(sym_ld_large) # CHECK-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_ld_large) # CHECK-NEXT: lu32i.d $a1, %got64_pc_lo20(sym_ld_large) # CHECK-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(sym_ld_large) # CHECK-NEXT: add.d $a0, $a0, $a1 +# CHECK-EMPTY: +# RELOC-NEXT: R_LARCH_TLS_LD_PC_HI20 sym_ld_large 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_ld_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_ld_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_ld_large 0x0 la.tls.gd $a0, sym_gd -# CHECK: pcalau12i $a0, %gd_pc_hi20(sym_gd) +# CHECK-NEXT: pcalau12i $a0, %gd_pc_hi20(sym_gd) # CHECK-NEXT: addi.d $a0, $a0, %got_pc_lo12(sym_gd) +# CHECK-EMPTY: +# RELOC-NEXT: R_LARCH_TLS_GD_PC_HI20 sym_gd 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_gd 0x0 la.tls.gd $a0, $a1, sym_gd_large -# CHECK: pcalau12i $a0, %gd_pc_hi20(sym_gd_large) +# CHECK-NEXT: pcalau12i $a0, %gd_pc_hi20(sym_gd_large) # CHECK-NEXT: addi.d $a1, $zero, %got_pc_lo12(sym_gd_large) # CHECK-NEXT: lu32i.d $a1, %got64_pc_lo20(sym_gd_large) # CHECK-NEXT: lu52i.d $a1, $a1, %got64_pc_hi12(sym_gd_large) # CHECK-NEXT: add.d $a0, $a0, $a1 +# CHECK-EMPTY: +# RELOC-NEXT: R_LARCH_TLS_GD_PC_HI20 sym_gd_large 0x0 +# RELOC-NEXT: R_LARCH_GOT_PC_LO12 sym_gd_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_LO20 sym_gd_large 0x0 +# RELOC-NEXT: R_LARCH_GOT64_PC_HI12 sym_gd_large 0x0 + +# RELOC-NEXT: } +# RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Misc/subsection.s b/llvm/test/MC/LoongArch/Misc/subsection.s index 0bd22b474536c..566a2408d6913 100644 --- a/llvm/test/MC/LoongArch/Misc/subsection.s +++ b/llvm/test/MC/LoongArch/Misc/subsection.s @@ -1,5 +1,5 @@ # RUN: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR,NORELAX --implicit-check-not=error: -## TODO: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR,RELAX --implicit-check-not=error: +# RUN: not llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR,RELAX --implicit-check-not=error: a: nop diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s index 532eb4e0561ac..c4454f5bb98d1 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s @@ -18,7 +18,9 @@ # RELAX: Relocations [ # RELAX-NEXT: Section ({{.*}}) .rela.text { # RELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .L1 0x0 +# RELAX-NEXT: 0x10 R_LARCH_RELAX - 0x0 # RELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .L1 0x0 +# RELAX-NEXT: 0x14 R_LARCH_RELAX - 0x0 # RELAX-NEXT: } # RELAX-NEXT: Section ({{.*}}) .rela.data { # RELAX-NEXT: 0xF R_LARCH_ADD8 .L3 0x0 @@ -29,13 +31,21 @@ # RELAX-NEXT: 0x12 R_LARCH_SUB32 .L2 0x0 # RELAX-NEXT: 0x16 R_LARCH_ADD64 .L3 0x0 # RELAX-NEXT: 0x16 R_LARCH_SUB64 .L2 0x0 +# RELAX-NEXT: 0x1E R_LARCH_ADD8 .L4 0x0 +# RELAX-NEXT: 0x1E 
R_LARCH_SUB8 .L3 0x0 +# RELAX-NEXT: 0x1F R_LARCH_ADD16 .L4 0x0 +# RELAX-NEXT: 0x1F R_LARCH_SUB16 .L3 0x0 +# RELAX-NEXT: 0x21 R_LARCH_ADD32 .L4 0x0 +# RELAX-NEXT: 0x21 R_LARCH_SUB32 .L3 0x0 +# RELAX-NEXT: 0x25 R_LARCH_ADD64 .L4 0x0 +# RELAX-NEXT: 0x25 R_LARCH_SUB64 .L3 0x0 # RELAX-NEXT: } # RELAX-NEXT: ] # RELAX: Hex dump of section '.data': # RELAX-NEXT: 0x00000000 04040004 00000004 00000000 00000000 -# RELAX-NEXT: 0x00000010 00000000 00000000 00000000 00000808 -# RELAX-NEXT: 0x00000020 00080000 00080000 00000000 00 +# RELAX-NEXT: 0x00000010 00000000 00000000 00000000 00000000 +# RELAX-NEXT: 0x00000020 00000000 00000000 00000000 00 .text .L1: @@ -60,8 +70,6 @@ .short .L3 - .L2 .word .L3 - .L2 .dword .L3 - .L2 -## TODO -## With relaxation, emit relocs because la.pcrel is a linker-relaxable inst. .byte .L4 - .L3 .short .L4 - .L3 .word .L4 - .L3 From 8ddb0fcff9ec73aeef20b1288b4ab5e03cd0bd56 Mon Sep 17 00:00:00 2001 From: Freddy Ye <freddy.ye@intel.com> Date: Wed, 27 Dec 2023 09:01:55 +0800 Subject: [PATCH 315/342] [X86] Correct operand order of UWRMSR. (#76389) --- clang/lib/Headers/usermsrintrin.h | 21 +++++++++++++++++++++ llvm/lib/Target/X86/X86InstrSystem.td | 4 ++-- llvm/test/CodeGen/X86/usermsr-intrinsics.ll | 8 ++++---- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/clang/lib/Headers/usermsrintrin.h b/clang/lib/Headers/usermsrintrin.h index 6d1424ad3b2ed..61388376706dc 100644 --- a/clang/lib/Headers/usermsrintrin.h +++ b/clang/lib/Headers/usermsrintrin.h @@ -14,12 +14,33 @@ #define __USERMSRINTRIN_H #ifdef __x86_64__ +/// Reads the contents of a 64-bit MSR specified in \a __A into \a dst. +/// +/// This intrinsic corresponds to the <c> URDMSR </c> instruction. +/// \param __A +/// An unsigned long long. +/// +/// \code{.operation} +/// DEST := MSR[__A] +/// \endcode static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("usermsr"))) _urdmsr(unsigned long long __A) { return __builtin_ia32_urdmsr(__A); } +/// Writes the contents of \a __B into the 64-bit MSR specified in \a __A. +/// +/// This intrinsic corresponds to the <c> UWRMSR </c> instruction. +/// +/// \param __A +/// An unsigned long long. +/// \param __B +/// An unsigned long long. 
+/// +/// \code{.operation} +/// MSR[__A] := __B +/// \endcode static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("usermsr"))) _uwrmsr(unsigned long long __A, unsigned long long __B) { diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index efb58c6102dd1..699e5847e63fb 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -446,11 +446,11 @@ let Predicates = [HasUSERMSR], mayLoad = 1 in { } let Predicates = [HasUSERMSR], mayStore = 1 in { def UWRMSRrr : I<0xf8, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2), - "uwrmsr\t{$src1, $src2|$src2, $src1}", + "uwrmsr\t{$src2, $src1|$src1, $src2}", [(int_x86_uwrmsr GR64:$src1, GR64:$src2)]>, T8, XS; def UWRMSRir : Ii32<0xf8, MRM0r, (outs), (ins GR64:$src, i64i32imm:$imm), "uwrmsr\t{$src, $imm|$imm, $src}", - [(int_x86_uwrmsr GR64:$src, i64immSExt32_su:$imm)]>, T_MAP7, XS, VEX; + [(int_x86_uwrmsr i64immSExt32_su:$imm, GR64:$src)]>, T_MAP7, XS, VEX; } let Defs = [RAX, RDX], Uses = [ECX] in def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB; diff --git a/llvm/test/CodeGen/X86/usermsr-intrinsics.ll b/llvm/test/CodeGen/X86/usermsr-intrinsics.ll index 29801a494f498..fa569affdd9ff 100644 --- a/llvm/test/CodeGen/X86/usermsr-intrinsics.ll +++ b/llvm/test/CodeGen/X86/usermsr-intrinsics.ll @@ -35,7 +35,7 @@ declare i64 @llvm.x86.urdmsr(i64 %A) define void @test_int_x86_uwrmsr(i64 %A, i64 %B) nounwind { ; X64-LABEL: test_int_x86_uwrmsr: ; X64: # %bb.0: -; X64-NEXT: uwrmsr %rdi, %rsi # encoding: [0xf3,0x0f,0x38,0xf8,0xfe] +; X64-NEXT: uwrmsr %rsi, %rdi # encoding: [0xf3,0x0f,0x38,0xf8,0xfe] ; X64-NEXT: retq # encoding: [0xc3] call void @llvm.x86.uwrmsr(i64 %A, i64 %B) ret void @@ -46,7 +46,7 @@ define void @test_int_x86_uwrmsr_const(i64 %A) nounwind { ; X64: # %bb.0: ; X64-NEXT: uwrmsr %rdi, $123 # encoding: [0xc4,0xe7,0x7a,0xf8,0xc7,0x7b,0x00,0x00,0x00] ; X64-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.uwrmsr(i64 %A, i64 123) + call void @llvm.x86.uwrmsr(i64 123, i64 %A) ret void } @@ -55,9 +55,9 @@ define void @test_int_x86_uwrmsr_const_i64(i64 %A) nounwind { ; X64: # %bb.0: ; X64-NEXT: movabsq $8589934591, %rax # encoding: [0x48,0xb8,0xff,0xff,0xff,0xff,0x01,0x00,0x00,0x00] ; X64-NEXT: # imm = 0x1FFFFFFFF -; X64-NEXT: uwrmsr %rdi, %rax # encoding: [0xf3,0x0f,0x38,0xf8,0xf8] +; X64-NEXT: uwrmsr %rdi, %rax # encoding: [0xf3,0x0f,0x38,0xf8,0xc7] ; X64-NEXT: retq # encoding: [0xc3] - call void @llvm.x86.uwrmsr(i64 %A, i64 8589934591) + call void @llvm.x86.uwrmsr(i64 8589934591, i64 %A) ret void } From 256bf56afa58679b50a72b69c0e2a4d198d42247 Mon Sep 17 00:00:00 2001 From: Yeting Kuo <46629943+yetingk@users.noreply.github.com> Date: Wed, 27 Dec 2023 09:11:34 +0800 Subject: [PATCH 316/342] [RISCV] Update DecoderMethod and MCOperandPredicate of spimm. 
(#76061)

The spimm operand is an immediate of which only bits 4-5 may be set; it is
not based on the rlist operand.
---
 llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp | 2 --
 llvm/lib/Target/RISCV/RISCVInstrInfoZc.td                 | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 53e2b6b4d94ea..184000b48987e 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -462,10 +462,8 @@ static DecodeStatus decodeRegReg(MCInst &Inst, uint32_t Insn, uint64_t Address,
   return MCDisassembler::Success;
 }
 
-// spimm is based on rlist now.
 static DecodeStatus decodeZcmpSpimm(MCInst &Inst, unsigned Imm,
                                     uint64_t Address, const void *Decoder) {
-  // TODO: check if spimm matches rlist
   Inst.addOperand(MCOperand::createImm(Imm));
   return MCDisassembler::Success;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
index 9a7249fe3e3d6..3506204d6c255 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZc.td
@@ -69,7 +69,7 @@ def spimm : Operand<OtherVT> {
     int64_t Imm;
     if (!MCOp.evaluateAsConstantImm(Imm))
       return false;
-    return isShiftedUInt<5, 4>(Imm);
+    return isShiftedUInt<2, 4>(Imm);
   }];
 }
 

From ce0c149f65d9fe708b91610fb94d3db443247262 Mon Sep 17 00:00:00 2001
From: Yeting Kuo <46629943+yetingk@users.noreply.github.com>
Date: Wed, 27 Dec 2023 09:12:35 +0800
Subject: [PATCH 317/342] [RISCV][Clang] Remove -save-restore from default features. (#76390)

It's unnecessary to pass the feature `-save-restore` by default, since the
RISC-V backend disables the save-restore functionality by default.
---
 clang/lib/Driver/ToolChains/Arch/RISCV.cpp | 7 -------
 clang/test/Driver/riscv-default-features.c | 2 --
 clang/test/Driver/riscv-features.c         | 2 +-
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
index 25b43cefce6b5..8ae47d1680bd2 100644
--- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp
@@ -167,13 +167,6 @@ void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple,
     Features.push_back("-relax");
   }
 
-  // GCC Compatibility: -mno-save-restore is default, unless -msave-restore is
-  // specified.
-  if (Args.hasFlag(options::OPT_msave_restore, options::OPT_mno_save_restore, false))
-    Features.push_back("+save-restore");
-  else
-    Features.push_back("-save-restore");
-
   // -mno-unaligned-access is default, unless -munaligned-access is specified.
AddTargetFeature(Args, Features, options::OPT_munaligned_access, options::OPT_mno_unaligned_access, "fast-unaligned-access"); diff --git a/clang/test/Driver/riscv-default-features.c b/clang/test/Driver/riscv-default-features.c index 6e48f7cc37dcb..4c3883c1cc118 100644 --- a/clang/test/Driver/riscv-default-features.c +++ b/clang/test/Driver/riscv-default-features.c @@ -2,9 +2,7 @@ // RUN: %clang --target=riscv64-unknown-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix=RV64 // RV32: "target-features"="+32bit,+a,+c,+m,+relax, -// RV32-SAME: -save-restore // RV64: "target-features"="+64bit,+a,+c,+m,+relax, -// RV64-SAME: -save-restore // Dummy function int foo(void){ diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c index 716f3f6da57b8..d3700f71aa7e1 100644 --- a/clang/test/Driver/riscv-features.c +++ b/clang/test/Driver/riscv-features.c @@ -24,7 +24,7 @@ // SAVE-RESTORE: "-target-feature" "+save-restore" // NO-SAVE-RESTORE: "-target-feature" "-save-restore" -// DEFAULT: "-target-feature" "-save-restore" +// DEFAULT-NOT: "-target-feature" "-save-restore" // DEFAULT-NOT: "-target-feature" "+save-restore" // RUN: %clang --target=riscv32-unknown-elf -### %s -munaligned-access 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS From 36fd7291cdd85b2820000950d3782758353d259e Mon Sep 17 00:00:00 2001 From: "Balaji V. Iyer" <43187390+bviyer@users.noreply.github.com> Date: Tue, 26 Dec 2023 19:29:04 -0600 Subject: [PATCH 318/342] [mlir][Quasipolynomial] Fixed -Wunused-variable in GeneratorFunction.h (#76419) ``` llvm-project/mlir/lib/Analysis/Presburger/GeneratingFunction.h:56:28: error: unused variable 'term' [-Werror,-Wunused-variable] 56 | for (const ParamPoint &term : numerators) | ^~~~ 1 error generated. ``` --- mlir/lib/Analysis/Presburger/GeneratingFunction.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlir/lib/Analysis/Presburger/GeneratingFunction.h b/mlir/lib/Analysis/Presburger/GeneratingFunction.h index dad9594f960d1..f7deba921ea51 100644 --- a/mlir/lib/Analysis/Presburger/GeneratingFunction.h +++ b/mlir/lib/Analysis/Presburger/GeneratingFunction.h @@ -53,10 +53,12 @@ class GeneratingFunction { std::vector<ParamPoint> nums, std::vector<std::vector<Point>> dens) : numParam(numParam), signs(signs), numerators(nums), denominators(dens) { +#ifndef NDEBUG for (const ParamPoint &term : numerators) assert(term.getNumColumns() == numParam + 1 && "dimensionality of numerator exponents does not match number of " "parameters!"); +#endif // NDEBUG } unsigned getNumParams() { return numParam; } From 4a601cf75c6c7033cdd5557d54a74bacb372679b Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Tue, 26 Dec 2023 18:36:06 -0800 Subject: [PATCH 319/342] [RISCV] Fix formatting. 
NFC --- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 1b63ee7ac4bbd..38d05877bb45c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -581,10 +581,10 @@ defset list<VTypeInfoToWide> VQMACCDODInfoPairs = { } defset list<VTypeInfoToWide> VQMACCQOQInfoPairs = { - def : VTypeInfoToWide<VI8MF2, VI32M1>; - def : VTypeInfoToWide<VI8M1, VI32M2>; - def : VTypeInfoToWide<VI8M2, VI32M4>; - def : VTypeInfoToWide<VI8M4, VI32M8>; + def : VTypeInfoToWide<VI8MF2, VI32M1>; + def : VTypeInfoToWide<VI8M1, VI32M2>; + def : VTypeInfoToWide<VI8M2, VI32M4>; + def : VTypeInfoToWide<VI8M4, VI32M8>; } multiclass VPatVQMACCDOD<string intrinsic, string instruction, string kind> From 23b82c987d690939f3e7b1431d6004f409c10425 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng <wangpengcheng.pp@bytedance.com> Date: Wed, 27 Dec 2023 11:19:38 +0800 Subject: [PATCH 320/342] [RISCV] Move Zimop to RISCVInstrInfoZimop.td (#76392) So the structure of TableGen files is still clear. --- llvm/lib/Target/RISCV/RISCVInstrFormats.td | 21 ------- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 29 +--------- llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td | 59 ++++++++++++++++++++ 3 files changed, 60 insertions(+), 49 deletions(-) create mode 100644 llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index 288c33cfe11c8..f56f49ae24571 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -410,27 +410,6 @@ class RVInstIUnary<bits<12> imm12, bits<3> funct3, RISCVOpcode opcode, let Inst{31-20} = imm12; } -class RVInstIMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3, RISCVOpcode opcode, - dag outs, dag ins, string opcodestr, string argstr> - : RVInstIBase<funct3, opcode, outs, ins, opcodestr, argstr> { - let Inst{31} = imm7{6}; - let Inst{30} = imm5{4}; - let Inst{29-28} = imm7{5-4}; - let Inst{27-26} = imm5{3-2}; - let Inst{25-22} = imm7{3-0}; - let Inst{21-20} = imm5{1-0}; -} - -class RVInstRMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3, RISCVOpcode opcode, - dag outs, dag ins, string opcodestr, string argstr> - : RVInstRBase<funct3, opcode, outs, ins, opcodestr, argstr> { - let Inst{31} = imm4{3}; - let Inst{30} = imm3{2}; - let Inst{29-28} = imm4{2-1}; - let Inst{27-26} = imm3{1-0}; - let Inst{25} = imm4{0}; -} - class RVInstS<bits<3> funct3, RISCVOpcode opcode, dag outs, dag ins, string opcodestr, string argstr> : RVInst<outs, ins, opcodestr, argstr, [], InstFormatS> { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 099cc0abd1424..3ee2a08089a96 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -597,18 +597,6 @@ class Priv_rr<string opcodestr, bits<7> funct7> let rd = 0; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class RVMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3, - RISCVOpcode opcode, string opcodestr> - : RVInstIMopr<imm7, imm5, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1), - opcodestr, "$rd, $rs1">; - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class RVMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3, - RISCVOpcode opcode, string opcodestr> - : RVInstRMoprr<imm4, imm3, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2), - opcodestr, "$rd, 
$rs1, $rs2">; - //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -798,22 +786,6 @@ def SRAW : ALUW_rr<0b0100000, 0b101, "sraw">, } // IsSignExtendingOpW = 1 } // Predicates = [IsRV64] -// Zimop instructions - -foreach i = 0...31 in { - let Predicates = [HasStdExtZimop] in { - def MOPR#i : RVMopr<0b1000111, i, 0b100, OPC_SYSTEM, "mop.r."#i>, - Sched<[]>; - } // Predicates = [HasStdExtZimop] -} - -foreach i = 0...7 in { - let Predicates = [HasStdExtZimop] in { - def MOPRR#i : RVMoprr<0b1001, i, 0b100, OPC_SYSTEM, "mop.rr."#i>, - Sched<[]>; - } // Predicates = [HasStdExtZimop] -} - //===----------------------------------------------------------------------===// // Privileged instructions //===----------------------------------------------------------------------===// @@ -2140,6 +2112,7 @@ include "RISCVInstrInfoV.td" include "RISCVInstrInfoZvk.td" // Integer +include "RISCVInstrInfoZimop.td" include "RISCVInstrInfoZicbo.td" include "RISCVInstrInfoZicond.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td new file mode 100644 index 0000000000000..1e8c70046c634 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZimop.td @@ -0,0 +1,59 @@ +//===-- RISCVInstrInfoZimop.td -----------------------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the RISC-V instructions from the standard +// May-Be-Operations Extension (Zimop). +// This version is still experimental as the 'Zimop' extension hasn't been +// ratified yet. It is based on v0.1 of the specification. 
+//
+//===----------------------------------------------------------------------===//
+
+class RVInstIMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3, RISCVOpcode opcode,
+                  dag outs, dag ins, string opcodestr, string argstr>
+    : RVInstIBase<funct3, opcode, outs, ins, opcodestr, argstr> {
+  let Inst{31} = imm7{6};
+  let Inst{30} = imm5{4};
+  let Inst{29-28} = imm7{5-4};
+  let Inst{27-26} = imm5{3-2};
+  let Inst{25-22} = imm7{3-0};
+  let Inst{21-20} = imm5{1-0};
+}
+
+class RVInstRMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3, RISCVOpcode opcode,
+                   dag outs, dag ins, string opcodestr, string argstr>
+    : RVInstRBase<funct3, opcode, outs, ins, opcodestr, argstr> {
+  let Inst{31} = imm4{3};
+  let Inst{30} = imm3{2};
+  let Inst{29-28} = imm4{2-1};
+  let Inst{27-26} = imm3{1-0};
+  let Inst{25} = imm4{0};
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVMopr<bits<7> imm7, bits<5> imm5, bits<3> funct3,
+             RISCVOpcode opcode, string opcodestr>
+    : RVInstIMopr<imm7, imm5, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1),
+                  opcodestr, "$rd, $rs1">;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVMoprr<bits<4> imm4, bits<3> imm3, bits<3> funct3,
+              RISCVOpcode opcode, string opcodestr>
+    : RVInstRMoprr<imm4, imm3, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
+                   opcodestr, "$rd, $rs1, $rs2">;
+
+foreach i = 0...31 in {
+  let Predicates = [HasStdExtZimop] in
+  def MOPR#i : RVMopr<0b1000111, i, 0b100, OPC_SYSTEM, "mop.r."#i>,
+               Sched<[]>;
+}
+
+foreach i = 0...7 in {
+  let Predicates = [HasStdExtZimop] in
+  def MOPRR#i : RVMoprr<0b1001, i, 0b100, OPC_SYSTEM, "mop.rr."#i>,
+                Sched<[]>;
+}

From 6f85075ff7281be1802c9fd30758d34b67481a1e Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min@myhsu.dev>
Date: Tue, 26 Dec 2023 13:20:47 -0800
Subject: [PATCH 321/342] [M68k] U/SMULd32d16 are not supposed to be commutable

M68k only has a 16-bit x 16-bit -> 32-bit variant for multiplications taking
16-bit operands. We still define two input operands for this class of
instructions, and tie the first operand to the result value. The problem is
that the two operands have different register classes (DR32 and DR16), so
marking these instructions commutable produces invalid MachineInstrs (though
the final assembly will still be correct).
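To make the register-class mismatch concrete, here is a minimal standalone
sketch (plain C++, not LLVM's MachineInstr API; the register names and the
`isValidMULd32d16` helper are made up for illustration). It models why
swapping the two sources of a d32/d16 multiply cannot yield a valid
instruction even though multiplication itself commutes:

```cpp
#include <cassert>
#include <cstdio>

// The result-tied source must be a 32-bit data register (DR32); the other
// source must be a 16-bit data register (DR16). Swapping them puts a DR16
// value in the tied slot, which violates the operand constraints.
enum class RegClass { DR32, DR16 };

struct Operand {
  const char *Name;
  RegClass RC;
};

// Operand 0 is tied to the 32-bit destination; operand 1 is the 16-bit source.
bool isValidMULd32d16(const Operand &Op0, const Operand &Op1) {
  return Op0.RC == RegClass::DR32 && Op1.RC == RegClass::DR16;
}

int main() {
  Operand Tied32{"%d0 (32-bit, tied to result)", RegClass::DR32};
  Operand Src16{"%d1 (16-bit)", RegClass::DR16};

  assert(isValidMULd32d16(Tied32, Src16));  // original operand order: valid
  assert(!isValidMULd32d16(Src16, Tied32)); // "commuted" order: invalid
  std::printf("cannot commute %s with %s\n", Tied32.Name, Src16.Name);
  return 0;
}
```

In other words, the arithmetic commutes but the operand slots do not, so
dropping the isCommutable flag is the fix.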
--- llvm/lib/Target/M68k/M68kInstrArithmetic.td | 10 ++-- .../CodeGen/M68k/Arith/smul-with-overflow.ll | 58 +++++++++++++++++-- .../CodeGen/M68k/Arith/umul-with-overflow.ll | 50 ++++++++++++++++ 3 files changed, 108 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td index 7f250f4e56ef2..1f5f1e815e2bf 100644 --- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td +++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td @@ -623,11 +623,9 @@ class MxDiMuOp_DI<string MN, bits<4> CMD, bit SIGNED = false, } // let Constraints } // Defs = [CCR] -multiclass MxDiMuOp<string MN, bits<4> CMD, bit isComm = 0> { - let isCommutable = isComm in { - def "S"#NAME#"d32d16" : MxDiMuOp_DD<MN#"s", CMD, /*SIGNED*/true, MxDRD32, MxDRD16>; - def "U"#NAME#"d32d16" : MxDiMuOp_DD<MN#"u", CMD, /*SIGNED*/false, MxDRD32, MxDRD16>; - } +multiclass MxDiMuOp<string MN, bits<4> CMD> { + def "S"#NAME#"d32d16" : MxDiMuOp_DD<MN#"s", CMD, /*SIGNED*/true, MxDRD32, MxDRD16>; + def "U"#NAME#"d32d16" : MxDiMuOp_DD<MN#"u", CMD, /*SIGNED*/false, MxDRD32, MxDRD16>; def "S"#NAME#"d32i16" : MxDiMuOp_DI<MN#"s", CMD, /*SIGNED*/true, MxDRD32, Mxi16imm>; def "U"#NAME#"d32i16" : MxDiMuOp_DI<MN#"u", CMD, /*SIGNED*/false, MxDRD32, Mxi16imm>; @@ -729,7 +727,7 @@ def : Pat<(urem i16:$dst, MximmSExt16:$opd), MxSubRegIndex16Lo)>; -defm MUL : MxDiMuOp<"mul", 0xC, 1>; +defm MUL : MxDiMuOp<"mul", 0xC>; def SMULd32d32 : MxDiMuOp_DD_Long<"muls.l", MxSMul, 0x130, /*SIGNED*/true>; def UMULd32d32 : MxDiMuOp_DD_Long<"mulu.l", MxUMul, 0x130, /*SIGNED*/false>; diff --git a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll index cd9349181a631..485cc50fbee2b 100644 --- a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll +++ b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll @@ -1,6 +1,56 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=m68k-linux -verify-machineinstrs | FileCheck %s +define zeroext i8 @smul_i8(i8 signext %a, i8 signext %b) nounwind ssp { +; CHECK-LABEL: smul_i8: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: move.b (11,%sp), %d1 +; CHECK-NEXT: and.l #255, %d1 +; CHECK-NEXT: move.b (7,%sp), %d0 +; CHECK-NEXT: and.l #255, %d0 +; CHECK-NEXT: muls %d1, %d0 +; CHECK-NEXT: move.b #0, %d1 +; CHECK-NEXT: move.w %d1, %ccr +; CHECK-NEXT: bvs .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %entry +; CHECK-NEXT: move.b #42, %d0 +; CHECK-NEXT: .LBB0_2: ; %entry +; CHECK-NEXT: and.l #255, %d0 +; CHECK-NEXT: rts +entry: + %smul = tail call { i8, i1 } @llvm.smul.with.overflow.i8(i8 %a, i8 %b) + %cmp = extractvalue { i8, i1 } %smul, 1 + %smul.result = extractvalue { i8, i1 } %smul, 0 + %X = select i1 %cmp, i8 %smul.result, i8 42 + ret i8 %X +} + +declare { i8, i1 } @llvm.smul.with.overflow.i8(i8, i8) nounwind readnone + +define zeroext i16 @smul_i16(i16 signext %a, i16 signext %b) nounwind ssp { +; CHECK-LABEL: smul_i16: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: move.w (6,%sp), %d0 +; CHECK-NEXT: move.w (10,%sp), %d1 +; CHECK-NEXT: muls %d1, %d0 +; CHECK-NEXT: move.b #0, %d1 +; CHECK-NEXT: move.w %d1, %ccr +; CHECK-NEXT: bvs .LBB1_2 +; CHECK-NEXT: ; %bb.1: ; %entry +; CHECK-NEXT: move.w #42, %d0 +; CHECK-NEXT: .LBB1_2: ; %entry +; CHECK-NEXT: and.l #65535, %d0 +; CHECK-NEXT: rts +entry: + %smul = tail call { i16, i1 } @llvm.smul.with.overflow.i16(i16 %a, i16 %b) + %cmp = extractvalue { i16, i1 } %smul, 1 + %smul.result = extractvalue { i16, i1 } %smul, 0 + %X = select i1 %cmp, i16 %smul.result, 
i16 42 + ret i16 %X +} + +declare { i16, i1 } @llvm.smul.with.overflow.i16(i16, i16) nounwind readnone + declare i32 @printf(i8*, ...) nounwind declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) @@ -12,7 +62,7 @@ define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: suba.l #12, %sp ; CHECK-NEXT: muls.l %d1, %d0 -; CHECK-NEXT: bvc .LBB0_1 +; CHECK-NEXT: bvc .LBB2_1 ; CHECK-NEXT: ; %bb.2: ; %overflow ; CHECK-NEXT: lea (no,%pc), %a0 ; CHECK-NEXT: move.l %a0, (%sp) @@ -20,7 +70,7 @@ define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind { ; CHECK-NEXT: move.b #0, %d0 ; CHECK-NEXT: adda.l #12, %sp ; CHECK-NEXT: rts -; CHECK-NEXT: .LBB0_1: ; %normal +; CHECK-NEXT: .LBB2_1: ; %normal ; CHECK-NEXT: move.l %d0, (4,%sp) ; CHECK-NEXT: lea (ok,%pc), %a0 ; CHECK-NEXT: move.l %a0, (%sp) @@ -50,7 +100,7 @@ define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind { ; CHECK-NEXT: muls.l %d1, %d0 ; CHECK-NEXT: svs %d1 ; CHECK-NEXT: sub.b #1, %d1 -; CHECK-NEXT: bne .LBB1_2 +; CHECK-NEXT: bne .LBB3_2 ; CHECK-NEXT: ; %bb.1: ; %overflow ; CHECK-NEXT: lea (no,%pc), %a0 ; CHECK-NEXT: move.l %a0, (%sp) @@ -58,7 +108,7 @@ define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind { ; CHECK-NEXT: move.b #0, %d0 ; CHECK-NEXT: adda.l #12, %sp ; CHECK-NEXT: rts -; CHECK-NEXT: .LBB1_2: ; %normal +; CHECK-NEXT: .LBB3_2: ; %normal ; CHECK-NEXT: move.l %d0, (4,%sp) ; CHECK-NEXT: lea (ok,%pc), %a0 ; CHECK-NEXT: move.l %a0, (%sp) diff --git a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll index ef7171dc386fe..1dfb959e468ce 100644 --- a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll +++ b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll @@ -1,6 +1,56 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=m68k -verify-machineinstrs | FileCheck %s +define zeroext i8 @umul_i8(i8 signext %a, i8 signext %b) nounwind ssp { +; CHECK-LABEL: umul_i8: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: move.b (11,%sp), %d1 +; CHECK-NEXT: and.l #255, %d1 +; CHECK-NEXT: move.b (7,%sp), %d0 +; CHECK-NEXT: and.l #255, %d0 +; CHECK-NEXT: muls %d1, %d0 +; CHECK-NEXT: move.b #0, %d1 +; CHECK-NEXT: move.w %d1, %ccr +; CHECK-NEXT: bvs .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %entry +; CHECK-NEXT: move.b #42, %d0 +; CHECK-NEXT: .LBB0_2: ; %entry +; CHECK-NEXT: and.l #255, %d0 +; CHECK-NEXT: rts +entry: + %umul = tail call { i8, i1 } @llvm.umul.with.overflow.i8(i8 %a, i8 %b) + %cmp = extractvalue { i8, i1 } %umul, 1 + %umul.result = extractvalue { i8, i1 } %umul, 0 + %X = select i1 %cmp, i8 %umul.result, i8 42 + ret i8 %X +} + +declare { i8, i1 } @llvm.umul.with.overflow.i8(i8, i8) nounwind readnone + +define zeroext i16 @umul_i16(i16 signext %a, i16 signext %b) nounwind ssp { +; CHECK-LABEL: umul_i16: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: move.w (6,%sp), %d0 +; CHECK-NEXT: move.w (10,%sp), %d1 +; CHECK-NEXT: muls %d1, %d0 +; CHECK-NEXT: move.b #0, %d1 +; CHECK-NEXT: move.w %d1, %ccr +; CHECK-NEXT: bvs .LBB1_2 +; CHECK-NEXT: ; %bb.1: ; %entry +; CHECK-NEXT: move.w #42, %d0 +; CHECK-NEXT: .LBB1_2: ; %entry +; CHECK-NEXT: and.l #65535, %d0 +; CHECK-NEXT: rts +entry: + %umul = tail call { i16, i1 } @llvm.umul.with.overflow.i16(i16 %a, i16 %b) + %cmp = extractvalue { i16, i1 } %umul, 1 + %umul.result = extractvalue { i16, i1 } %umul, 0 + %X = select i1 %cmp, i16 %umul.result, i16 42 + ret i16 %X +} + +declare { i16, i1 } @llvm.umul.with.overflow.i16(i16, i16) nounwind readnone + declare {i32, i1} 
@llvm.umul.with.overflow.i32(i32 %a, i32 %b)

define i1 @a(i32 %x) nounwind {

From 2476e2a91140b57ca3ad0792597be4f4d198d42247 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min@myhsu.dev>
Date: Tue, 26 Dec 2023 16:29:02 -0800
Subject: [PATCH 322/342] [M68k] Optimize overflow arithmetic that will never overflow

We lower overflow arithmetic to its M68kISD counterparts, which produce
results of {i16/i32, i8} in which the second result represents the CCR. In
the event that we're certain there won't be an overflow, for instance 8- and
16-bit multiplications, we simply use zero in place of the second result.

This patch replaces an M68kISD::CMOV that takes this kind of all-zero or
all-ones CCR as its condition value with the corresponding operand value.
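As a rough standalone model of the constant-condition folding this adds to
LowerSELECT (plain C++, not the actual backend code; the
`foldCMovWithConstCCR` helper and its `Folded` flag are invented for
illustration), the low five CCR bits hold the condition codes, so an all-zero
or all-ones constant decides the select outright:

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the countr_zero()/countr_one() >= 5 checks: if the five CCR
// condition bits (C, V, Z, N, X) are all zero the condition is false and the
// "false" operand wins; if they are all ones the "true" operand wins.
int32_t foldCMovWithConstCCR(uint8_t CCR, int32_t TrueVal, int32_t FalseVal,
                             bool &Folded) {
  Folded = true;
  if ((CCR & 0x1F) == 0x00) // countr_zero(CCR) >= 5
    return FalseVal;
  if ((CCR & 0x1F) == 0x1F) // countr_one(CCR) >= 5
    return TrueVal;
  Folded = false; // otherwise a real M68kISD::CMOV is still needed
  return 0;
}

int main() {
  bool Folded;
  // e.g. a multiply whose overflow flag is known to be zero:
  std::printf("%d\n", foldCMovWithConstCCR(0x00, 42, 7, Folded)); // prints 7
  std::printf("%d\n", foldCMovWithConstCCR(0x1F, 42, 7, Folded)); // prints 42
  return 0;
}
```

This is what lets cases like the new umul_i8_no_ovf test below collapse to a
plain constant instead of a branch on the CCR.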
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); diff --git a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll index 485cc50fbee2b..b649b2ba16147 100644 --- a/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll +++ b/llvm/test/CodeGen/M68k/Arith/smul-with-overflow.ll @@ -4,19 +4,28 @@ define zeroext i8 @smul_i8(i8 signext %a, i8 signext %b) nounwind ssp { ; CHECK-LABEL: smul_i8: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: move.b (11,%sp), %d1 -; CHECK-NEXT: and.l #255, %d1 -; CHECK-NEXT: move.b (7,%sp), %d0 +; CHECK-NEXT: move.b (11,%sp), %d0 ; CHECK-NEXT: and.l #255, %d0 -; CHECK-NEXT: muls %d1, %d0 -; CHECK-NEXT: move.b #0, %d1 -; CHECK-NEXT: move.w %d1, %ccr -; CHECK-NEXT: bvs .LBB0_2 -; CHECK-NEXT: ; %bb.1: ; %entry -; CHECK-NEXT: move.b #42, %d0 -; CHECK-NEXT: .LBB0_2: ; %entry +; CHECK-NEXT: move.b (7,%sp), %d1 +; CHECK-NEXT: and.l #255, %d1 +; CHECK-NEXT: muls %d0, %d1 +; CHECK-NEXT: move.l %d1, %d0 +; CHECK-NEXT: and.l #65535, %d0 ; CHECK-NEXT: and.l #255, %d0 ; CHECK-NEXT: rts +entry: + %smul = tail call { i8, i1 } @llvm.smul.with.overflow.i8(i8 %a, i8 %b) + %cmp = extractvalue { i8, i1 } %smul, 1 + %smul.result = extractvalue { i8, i1 } %smul, 0 + %X = select i1 %cmp, i8 42, i8 %smul.result + ret i8 %X +} + +define zeroext i8 @smul_i8_no_ovf(i8 signext %a, i8 signext %b) nounwind ssp { +; CHECK-LABEL: smul_i8_no_ovf: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: move.l #42, %d0 +; CHECK-NEXT: rts entry: %smul = tail call { i8, i1 } @llvm.smul.with.overflow.i8(i8 %a, i8 %b) %cmp = extractvalue { i8, i1 } %smul, 1 @@ -33,19 +42,13 @@ define zeroext i16 @smul_i16(i16 signext %a, i16 signext %b) nounwind ssp { ; CHECK-NEXT: move.w (6,%sp), %d0 ; CHECK-NEXT: move.w (10,%sp), %d1 ; CHECK-NEXT: muls %d1, %d0 -; CHECK-NEXT: move.b #0, %d1 -; CHECK-NEXT: move.w %d1, %ccr -; CHECK-NEXT: bvs .LBB1_2 -; CHECK-NEXT: ; %bb.1: ; %entry -; CHECK-NEXT: move.w #42, %d0 -; CHECK-NEXT: .LBB1_2: ; %entry ; CHECK-NEXT: and.l #65535, %d0 ; CHECK-NEXT: rts entry: %smul = tail call { i16, i1 } @llvm.smul.with.overflow.i16(i16 %a, i16 %b) %cmp = extractvalue { i16, i1 } %smul, 1 %smul.result = extractvalue { i16, i1 } %smul, 0 - %X = select i1 %cmp, i16 %smul.result, i16 42 + %X = select i1 %cmp, i16 42, i16 %smul.result ret i16 %X } @@ -62,7 +65,7 @@ define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: suba.l #12, %sp ; CHECK-NEXT: muls.l %d1, %d0 -; CHECK-NEXT: bvc .LBB2_1 +; CHECK-NEXT: bvc .LBB3_1 ; CHECK-NEXT: ; %bb.2: ; %overflow ; CHECK-NEXT: lea (no,%pc), %a0 ; CHECK-NEXT: move.l %a0, (%sp) @@ -70,7 +73,7 @@ define fastcc i1 @test1(i32 %v1, i32 %v2) nounwind { ; CHECK-NEXT: move.b #0, %d0 ; CHECK-NEXT: adda.l #12, %sp ; CHECK-NEXT: rts -; CHECK-NEXT: .LBB2_1: ; %normal +; CHECK-NEXT: .LBB3_1: ; %normal ; CHECK-NEXT: move.l %d0, (4,%sp) ; CHECK-NEXT: lea (ok,%pc), %a0 ; CHECK-NEXT: move.l %a0, (%sp) @@ -100,7 +103,7 @@ define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind { ; CHECK-NEXT: muls.l %d1, %d0 ; CHECK-NEXT: svs %d1 ; CHECK-NEXT: sub.b #1, %d1 -; CHECK-NEXT: bne .LBB3_2 +; CHECK-NEXT: bne .LBB4_2 ; CHECK-NEXT: ; %bb.1: ; %overflow ; CHECK-NEXT: lea (no,%pc), %a0 ; CHECK-NEXT: move.l %a0, (%sp) @@ -108,7 +111,7 @@ define fastcc i1 @test2(i32 %v1, i32 %v2) nounwind { ; CHECK-NEXT: move.b #0, %d0 ; CHECK-NEXT: adda.l #12, %sp ; CHECK-NEXT: rts -; CHECK-NEXT: .LBB3_2: ; %normal +; CHECK-NEXT: .LBB4_2: ; %normal ; CHECK-NEXT: move.l %d0, (4,%sp) ; CHECK-NEXT: lea (ok,%pc), %a0 ; CHECK-NEXT: move.l 
%a0, (%sp) diff --git a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll index 1dfb959e468ce..fd128a3e52bd3 100644 --- a/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll +++ b/llvm/test/CodeGen/M68k/Arith/umul-with-overflow.ll @@ -4,19 +4,28 @@ define zeroext i8 @umul_i8(i8 signext %a, i8 signext %b) nounwind ssp { ; CHECK-LABEL: umul_i8: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: move.b (11,%sp), %d1 -; CHECK-NEXT: and.l #255, %d1 -; CHECK-NEXT: move.b (7,%sp), %d0 +; CHECK-NEXT: move.b (11,%sp), %d0 ; CHECK-NEXT: and.l #255, %d0 -; CHECK-NEXT: muls %d1, %d0 -; CHECK-NEXT: move.b #0, %d1 -; CHECK-NEXT: move.w %d1, %ccr -; CHECK-NEXT: bvs .LBB0_2 -; CHECK-NEXT: ; %bb.1: ; %entry -; CHECK-NEXT: move.b #42, %d0 -; CHECK-NEXT: .LBB0_2: ; %entry +; CHECK-NEXT: move.b (7,%sp), %d1 +; CHECK-NEXT: and.l #255, %d1 +; CHECK-NEXT: muls %d0, %d1 +; CHECK-NEXT: move.l %d1, %d0 +; CHECK-NEXT: and.l #65535, %d0 ; CHECK-NEXT: and.l #255, %d0 ; CHECK-NEXT: rts +entry: + %umul = tail call { i8, i1 } @llvm.umul.with.overflow.i8(i8 %a, i8 %b) + %cmp = extractvalue { i8, i1 } %umul, 1 + %umul.result = extractvalue { i8, i1 } %umul, 0 + %X = select i1 %cmp, i8 42, i8 %umul.result + ret i8 %X +} + +define zeroext i8 @umul_i8_no_ovf(i8 signext %a, i8 signext %b) nounwind ssp { +; CHECK-LABEL: umul_i8_no_ovf: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: move.l #42, %d0 +; CHECK-NEXT: rts entry: %umul = tail call { i8, i1 } @llvm.umul.with.overflow.i8(i8 %a, i8 %b) %cmp = extractvalue { i8, i1 } %umul, 1 @@ -33,19 +42,13 @@ define zeroext i16 @umul_i16(i16 signext %a, i16 signext %b) nounwind ssp { ; CHECK-NEXT: move.w (6,%sp), %d0 ; CHECK-NEXT: move.w (10,%sp), %d1 ; CHECK-NEXT: muls %d1, %d0 -; CHECK-NEXT: move.b #0, %d1 -; CHECK-NEXT: move.w %d1, %ccr -; CHECK-NEXT: bvs .LBB1_2 -; CHECK-NEXT: ; %bb.1: ; %entry -; CHECK-NEXT: move.w #42, %d0 -; CHECK-NEXT: .LBB1_2: ; %entry ; CHECK-NEXT: and.l #65535, %d0 ; CHECK-NEXT: rts entry: %umul = tail call { i16, i1 } @llvm.umul.with.overflow.i16(i16 %a, i16 %b) %cmp = extractvalue { i16, i1 } %umul, 1 %umul.result = extractvalue { i16, i1 } %umul, 0 - %X = select i1 %cmp, i16 %umul.result, i16 42 + %X = select i1 %cmp, i16 42, i16 %umul.result ret i16 %X } From 9f4b6e1bd33cb19b3ed9b8e2b0c3aa4c48d728d6 Mon Sep 17 00:00:00 2001 From: Shengchen Kan <shengchen.kan@intel.com> Date: Wed, 27 Dec 2023 13:55:44 +0800 Subject: [PATCH 323/342] [X86][NFC] Simplify the definitions of INC/DEC and NEG/NOT This patch is to extract the NFC in #76319 into a separate commit. --- llvm/lib/Target/X86/X86InstrArithmetic.td | 230 ++++++++++++---------- llvm/lib/Target/X86/X86InstrUtils.td | 26 +-- 2 files changed, 134 insertions(+), 122 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index 0582270285180..220ca31a825f9 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -277,31 +277,63 @@ class BinOpAIF_AF<bits<8> o, string m, X86TypeInfo t, Register areg, let SchedRW = [WriteADC]; } -// UnaryOpR - Instructions that read "reg" and write "reg". -class UnaryOpR<bits<8> o, Format f, string m, X86TypeInfo t, list<dag> p> - : ITy<o, f, t, (outs t.RegClass:$dst), - (ins t.RegClass:$src1), m, "$dst", p>, Sched<[WriteALU]>; - -// UnaryOpM - Instructions that read "[mem]" and writes "[mem]". 
-class UnaryOpM<bits<8> o, Format f, string m, X86TypeInfo t, list<dag> p> - : ITy<o, f, t, (outs), (ins t.MemOperand:$dst), m, "$dst", p>, - Sched<[WriteALURMW]> { +// UnaryOpR - Instructions that read "reg". +class UnaryOpR<bits<8> o, Format f, string m, string args, X86TypeInfo t, + dag out, list<dag> p> + : ITy<o, f, t, out, (ins t.RegClass:$src), m, args, p>, Sched<[WriteALU]>; +// UnaryOpR_R - Instructions that read "reg" and write "reg". +class UnaryOpR_R<bits<8> o, Format f, string m, X86TypeInfo t, + SDPatternOperator node> + : UnaryOpR<o, f, m, unaryop_args, t, (outs t.RegClass:$dst), + [(set t.RegClass:$dst, (node t.RegClass:$src))]>; +// UnaryOpR_RF - Instructions that read "reg" and write "reg"/EFLAGS. +class UnaryOpR_RF<bits<8> o, Format f, string m, X86TypeInfo t, + SDPatternOperator node> + : UnaryOpR<o, f, m, unaryop_args, t, (outs t.RegClass:$dst), + [(set t.RegClass:$dst, (node t.RegClass:$src)), + (implicit EFLAGS)]>, DefEFLAGS; + +// UnaryOpM - Instructions that read "[mem]". +class UnaryOpM<bits<8> o, Format f, string m, string args, X86TypeInfo t, + dag out, list<dag> p> + : ITy<o, f, t, out, (ins t.MemOperand:$src), m, args, p> { let mayLoad = 1; +} +// UnaryOpM_M - Instructions that read "[mem]" and writes "[mem]". +class UnaryOpM_M<bits<8> o, Format f, string m, X86TypeInfo t, + SDPatternOperator node> + : UnaryOpM<o, f, m, unaryop_args, t, (outs), + [(store (node (t.LoadNode addr:$src)), addr:$src)]>, + Sched<[WriteALURMW]>{ + let mayStore = 1; +} +// UnaryOpM_MF - Instructions that read "[mem]" and writes "[mem]"/EFLAGS. +class UnaryOpM_MF<bits<8> o, Format f, string m, X86TypeInfo t, + SDPatternOperator node> + : UnaryOpM<o, f, m, unaryop_args, t, (outs), + [(store (node (t.LoadNode addr:$src)), addr:$src), + (implicit EFLAGS)]>, Sched<[WriteALURMW]>, DefEFLAGS { let mayStore = 1; } //===----------------------------------------------------------------------===// // MUL/IMUL and DIV/IDIV Instructions // -class MulOpR<bits<8> o, Format f, string m, X86TypeInfo t, +class MulDivOpR<bits<8> o, Format f, string m, X86TypeInfo t, X86FoldableSchedWrite sched, list<dag> p> - : ITy<o, f, t, (outs), (ins t.RegClass:$src), m, "$src", p>, Sched<[sched]>; + : UnaryOpR<o, f, m, "$src", t, (outs), p> { + let SchedRW = [sched]; +} -class MulOpM<bits<8> o, Format f, string m, X86TypeInfo t, +class MulDivOpM<bits<8> o, Format f, string m, X86TypeInfo t, X86FoldableSchedWrite sched, list<dag> p> - : ITy<o, f, t, (outs), (ins t.MemOperand:$src), m, - "$src", p>, SchedLoadReg<sched> { - let mayLoad = 1; + : UnaryOpM<o, f, m, "$src", t, (outs), p> { + let SchedRW = + [sched.Folded, + // Memory operand. + ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, + // Register reads (implicit or explicit). + sched.ReadAfterFold, sched.ReadAfterFold]; } multiclass Mul<bits<8> o, string m, Format RegMRM, Format MemMRM, SDPatternOperator node> { @@ -312,23 +344,23 @@ multiclass Mul<bits<8> o, string m, Format RegMRM, Format MemMRM, SDPatternOpera // This probably ought to be moved to a def : Pat<> if the // syntax can be accepted. 
let Defs = [AL,EFLAGS,AX], Uses = [AL] in - def 8r : MulOpR<o, RegMRM, m, Xi8, WriteIMul8, + def 8r : MulDivOpR<o, RegMRM, m, Xi8, WriteIMul8, [(set AL, (node AL, GR8:$src)), (implicit EFLAGS)]>; let Defs = [AX,DX,EFLAGS], Uses = [AX] in - def 16r : MulOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, OpSize16; + def 16r : MulDivOpR<o, RegMRM, m, Xi16, WriteIMul16, []>, OpSize16; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in - def 32r : MulOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, OpSize32; + def 32r : MulDivOpR<o, RegMRM, m, Xi32, WriteIMul32, []>, OpSize32; let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in - def 64r : MulOpR<o, RegMRM, m, Xi64, WriteIMul64, []>; + def 64r : MulDivOpR<o, RegMRM, m, Xi64, WriteIMul64, []>; let Defs = [AL,EFLAGS,AX], Uses = [AL] in - def 8m : MulOpM<o, MemMRM, m, Xi8, WriteIMul8, + def 8m : MulDivOpM<o, MemMRM, m, Xi8, WriteIMul8, [(set AL, (node AL, (loadi8 addr:$src))), (implicit EFLAGS)]>; let Defs = [AX,DX,EFLAGS], Uses = [AX] in - def 16m : MulOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, OpSize16; + def 16m : MulDivOpM<o, MemMRM, m, Xi16, WriteIMul16, []>, OpSize16; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in - def 32m : MulOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, OpSize32; + def 32m : MulDivOpM<o, MemMRM, m, Xi32, WriteIMul32, []>, OpSize32; let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in - def 64m : MulOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, Requires<[In64BitMode]>; + def 64m : MulDivOpM<o, MemMRM, m, Xi64, WriteIMul64, []>, Requires<[In64BitMode]>; } defm MUL : Mul<0xF7, "mul", MRM4r, MRM4m, mul>; @@ -340,21 +372,21 @@ multiclass Div<bits<8> o, string m, Format RegMRM, Format MemMRM> { defvar sched32 = !if(!eq(m, "div"), WriteDiv32, WriteIDiv32); defvar sched64 = !if(!eq(m, "div"), WriteDiv64, WriteIDiv64); let Defs = [AL,AH,EFLAGS], Uses = [AX] in - def 8r : MulOpR<o, RegMRM, m, Xi8, sched8, []>; + def 8r : MulDivOpR<o, RegMRM, m, Xi8, sched8, []>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in - def 16r : MulOpR<o, RegMRM, m, Xi16, sched16, []>, OpSize16; + def 16r : MulDivOpR<o, RegMRM, m, Xi16, sched16, []>, OpSize16; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in - def 32r : MulOpR<o, RegMRM, m, Xi32, sched32, []>, OpSize32; + def 32r : MulDivOpR<o, RegMRM, m, Xi32, sched32, []>, OpSize32; let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in - def 64r : MulOpR<o, RegMRM, m, Xi64, sched64, []>; + def 64r : MulDivOpR<o, RegMRM, m, Xi64, sched64, []>; let Defs = [AL,AH,EFLAGS], Uses = [AX] in - def 8m : MulOpM<o, MemMRM, m, Xi8, sched8, []>; + def 8m : MulDivOpM<o, MemMRM, m, Xi8, sched8, []>; let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in - def 16m : MulOpM<o, MemMRM, m, Xi16, sched16, []>, OpSize16; + def 16m : MulDivOpM<o, MemMRM, m, Xi16, sched16, []>, OpSize16; let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in - def 32m : MulOpM<o, MemMRM, m, Xi32, sched32, []>, OpSize32; + def 32m : MulDivOpM<o, MemMRM, m, Xi32, sched32, []>, OpSize32; let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in - def 64m : MulOpM<o, MemMRM, m, Xi64, sched64, []>, Requires<[In64BitMode]>; + def 64m : MulDivOpM<o, MemMRM, m, Xi64, sched64, []>, Requires<[In64BitMode]>; } let hasSideEffects = 1 in { // so that we don't speculatively execute defm DIV: Div<0xF7, "div", MRM6r, MRM6m>; @@ -426,92 +458,84 @@ def IMUL64rmi32 : IMulOpMI_R<Xi64, WriteIMul64Imm>; //===----------------------------------------------------------------------===// // INC and DEC Instructions // -class INCDECR<Format f, string m, X86TypeInfo t, SDPatternOperator node> - : UnaryOpR<0xFF, f, m, t, - [(set t.RegClass:$dst, 
EFLAGS, (node t.RegClass:$src1, 1))]>, - DefEFLAGS { - let isConvertibleToThreeAddress = 1; // Can xform into LEA. +class IncOpR_RF<X86TypeInfo t> : UnaryOpR_RF<0xFF, MRM0r, "inc", t, null_frag> { + let Pattern = [(set t.RegClass:$dst, EFLAGS, + (X86add_flag_nocf t.RegClass:$src, 1))]; } -class INCDECM<Format f, string m, X86TypeInfo t, int num> - : UnaryOpM<0xFF, f, m, t, - [(store (add (t.LoadNode addr:$dst), num), addr:$dst), - (implicit EFLAGS)]>, DefEFLAGS; -// INCDECR_ALT - Instructions like "inc reg" short forms. -class INCDECR_ALT<bits<8> o, string m, X86TypeInfo t> - : UnaryOpR<o, AddRegFrm, m, t, []>, DefEFLAGS { - // Short forms only valid in 32-bit mode. Selected during MCInst lowering. - let Predicates = [Not64BitMode]; +class DecOpR_RF<X86TypeInfo t> : UnaryOpR_RF<0xFF, MRM1r, "dec", t, null_frag> { + let Pattern = [(set t.RegClass:$dst, EFLAGS, + (X86sub_flag_nocf t.RegClass:$src, 1))]; } -let Constraints = "$src1 = $dst" in { -def INC16r_alt : INCDECR_ALT<0x40, "inc", Xi16>, OpSize16; -def INC32r_alt : INCDECR_ALT<0x40, "inc", Xi32>, OpSize32; -def INC8r : INCDECR<MRM0r, "inc", Xi8, X86add_flag_nocf>; -def INC16r : INCDECR<MRM0r, "inc", Xi16, X86add_flag_nocf>, OpSize16; -def INC32r : INCDECR<MRM0r, "inc", Xi32, X86add_flag_nocf>, OpSize32; -def INC64r : INCDECR<MRM0r, "inc", Xi64, X86add_flag_nocf>; -def DEC16r_alt : INCDECR_ALT<0x48, "dec", Xi16>, OpSize16; -def DEC32r_alt : INCDECR_ALT<0x48, "dec", Xi32>, OpSize32; -def DEC8r : INCDECR<MRM1r, "dec", Xi8, X86sub_flag_nocf>; -def DEC16r : INCDECR<MRM1r, "dec", Xi16, X86sub_flag_nocf>, OpSize16; -def DEC32r : INCDECR<MRM1r, "dec", Xi32, X86sub_flag_nocf>, OpSize32; -def DEC64r : INCDECR<MRM1r, "dec", Xi64, X86sub_flag_nocf>; +class IncOpM_M<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM0m, "inc", t, null_frag> { + let Pattern = [(store (add (t.LoadNode addr:$src), 1), addr:$src), + (implicit EFLAGS)]; +} +class DecOpM_M<X86TypeInfo t> : UnaryOpM_MF<0xFF, MRM1m, "dec", t, null_frag> { + let Pattern = [(store (add (t.LoadNode addr:$src), -1), addr:$src), + (implicit EFLAGS)]; +} +// IncDec_Alt - Instructions like "inc reg" short forms. +// Short forms only valid in 32-bit mode. Selected during MCInst lowering. 
+class IncDec_Alt<bits<8> o, string m, X86TypeInfo t> + : UnaryOpR_RF<o, AddRegFrm, m, t, null_frag>, Requires<[Not64BitMode]>; + +let Constraints = "$src = $dst", isConvertibleToThreeAddress = 1 in { +def INC16r_alt : IncDec_Alt<0x40, "inc", Xi16>, OpSize16; +def INC32r_alt : IncDec_Alt<0x40, "inc", Xi32>, OpSize32; +def DEC16r_alt : IncDec_Alt<0x48, "dec", Xi16>, OpSize16; +def DEC32r_alt : IncDec_Alt<0x48, "dec", Xi32>, OpSize32; +def INC8r : IncOpR_RF<Xi8>; +def INC16r : IncOpR_RF<Xi16>, OpSize16; +def INC32r : IncOpR_RF<Xi32>, OpSize32; +def INC64r : IncOpR_RF<Xi64>; +def DEC8r : DecOpR_RF<Xi8>; +def DEC16r : DecOpR_RF<Xi16>, OpSize16; +def DEC32r : DecOpR_RF<Xi32>, OpSize32; +def DEC64r : DecOpR_RF<Xi64>; } let Predicates = [UseIncDec] in { -def INC8m : INCDECM<MRM0m, "inc", Xi8, 1>; -def INC16m : INCDECM<MRM0m, "inc", Xi16, 1>, OpSize16; -def INC32m : INCDECM<MRM0m, "inc", Xi32, 1>, OpSize32; -def DEC8m : INCDECM<MRM1m, "dec", Xi8, -1>; -def DEC16m : INCDECM<MRM1m, "dec", Xi16, -1>, OpSize16; -def DEC32m : INCDECM<MRM1m, "dec", Xi32, -1>, OpSize32; +def INC8m : IncOpM_M<Xi8>; +def INC16m : IncOpM_M<Xi16>, OpSize16; +def INC32m : IncOpM_M<Xi32>, OpSize32; +def DEC8m : DecOpM_M<Xi8>; +def DEC16m : DecOpM_M<Xi16>, OpSize16; +def DEC32m : DecOpM_M<Xi32>, OpSize32; } let Predicates = [UseIncDec, In64BitMode] in { -def INC64m : INCDECM<MRM0m, "inc", Xi64, 1>; -def DEC64m : INCDECM<MRM1m, "dec", Xi64, -1>; +def INC64m : IncOpM_M<Xi64>; +def DEC64m : DecOpM_M<Xi64>; } //===----------------------------------------------------------------------===// // NEG and NOT Instructions // -class NegOpR<bits<8> o, string m, X86TypeInfo t> - : UnaryOpR<o, MRM3r, m, t, - [(set t.RegClass:$dst, (ineg t.RegClass:$src1)), - (implicit EFLAGS)]>, DefEFLAGS; -class NegOpM<bits<8> o, string m, X86TypeInfo t> - : UnaryOpM<o, MRM3m, m, t, - [(store (ineg (t.LoadNode addr:$dst)), addr:$dst), - (implicit EFLAGS)]>, DefEFLAGS; - -// NOTE: NOT does not set EFLAGS! 
-class NotOpR<bits<8> o, string m, X86TypeInfo t> - : UnaryOpR<o, MRM2r, m, t, [(set t.RegClass:$dst, (not t.RegClass:$src1))]>; - -class NotOpM<bits<8> o, string m, X86TypeInfo t> - : UnaryOpM<o, MRM2m, m, t, - [(store (not (t.LoadNode addr:$dst)), addr:$dst)]>; - -let Constraints = "$src1 = $dst" in { -def NEG8r : NegOpR<0xF6, "neg", Xi8>; -def NEG16r : NegOpR<0xF7, "neg", Xi16>, OpSize16; -def NEG32r : NegOpR<0xF7, "neg", Xi32>, OpSize32; -def NEG64r : NegOpR<0xF7, "neg", Xi64>; +class NegOpR_RF<X86TypeInfo t> : UnaryOpR_RF<0xF7, MRM3r, "neg", t, ineg>; +class NegOpM_MF<X86TypeInfo t> : UnaryOpM_MF<0xF7, MRM3m, "neg", t, ineg>; + +class NotOpR_R<X86TypeInfo t> : UnaryOpR_R<0xF7, MRM2r, "not", t, not>; +class NotOpM_M<X86TypeInfo t> : UnaryOpM_M<0xF7, MRM2m, "not", t, not>; + +let Constraints = "$src = $dst" in { +def NEG8r : NegOpR_RF<Xi8>; +def NEG16r : NegOpR_RF<Xi16>, OpSize16; +def NEG32r : NegOpR_RF<Xi32>, OpSize32; +def NEG64r : NegOpR_RF<Xi64>; + +def NOT8r : NotOpR_R<Xi8>; +def NOT16r : NotOpR_R<Xi16>, OpSize16; +def NOT32r : NotOpR_R<Xi32>, OpSize32; +def NOT64r : NotOpR_R<Xi64>; } -def NEG8m : NegOpM<0xF6, "neg", Xi8>; -def NEG16m : NegOpM<0xF7, "neg", Xi16>, OpSize16; -def NEG32m : NegOpM<0xF7, "neg", Xi32>, OpSize32; -def NEG64m : NegOpM<0xF7, "neg", Xi64>, Requires<[In64BitMode]>; - -let Constraints = "$src1 = $dst" in { -def NOT8r : NotOpR<0xF6, "not", Xi8>; -def NOT16r : NotOpR<0xF7, "not", Xi16>, OpSize16; -def NOT32r : NotOpR<0xF7, "not", Xi32>, OpSize32; -def NOT64r : NotOpR<0xF7, "not", Xi64>; -} +def NEG8m : NegOpM_MF<Xi8>; +def NEG16m : NegOpM_MF<Xi16>, OpSize16; +def NEG32m : NegOpM_MF<Xi32>, OpSize32; +def NEG64m : NegOpM_MF<Xi64>, Requires<[In64BitMode]>; -def NOT8m : NotOpM<0xF6, "not", Xi8>; -def NOT16m : NotOpM<0xF7, "not", Xi16>, OpSize16; -def NOT32m : NotOpM<0xF7, "not", Xi32>, OpSize32; -def NOT64m : NotOpM<0xF7, "not", Xi64>, Requires<[In64BitMode]>; +def NOT8m : NotOpM_M<Xi8>; +def NOT16m : NotOpM_M<Xi16>, OpSize16; +def NOT32m : NotOpM_M<Xi32>, OpSize32; +def NOT64m : NotOpM_M<Xi64>, Requires<[In64BitMode]>; /// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is /// defined with "(set GPR:$dst, EFLAGS, (...". diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 89f5653c04f2d..b7d2d8096ff54 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -99,17 +99,6 @@ class DisassembleOnly { bit ForceDisassemble = 1; } - -// SchedModel info for instruction that loads one value and gets the second -// (and possibly third) value from a register. -// This is used for instructions that put the memory operands before other -// uses. -class SchedLoadReg<X86FoldableSchedWrite Sched> : Sched<[Sched.Folded, - // Memory operand. - ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, - // Register reads (implicit or explicit). - Sched.ReadAfterFold, Sched.ReadAfterFold]>; - //===----------------------------------------------------------------------===// // X86 Type infomation definitions //===----------------------------------------------------------------------===// @@ -957,16 +946,15 @@ class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, /// 2. Infers whether the instruction should have a 0x40 REX_W prefix. /// 3. Infers whether the low bit of the opcode should be 0 (for i8 operations) /// or 1 (for i16,i32,i64 operations). 
-class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, - string mnemonic, string args, list<dag> pattern> - : I<{opcode{7}, opcode{6}, opcode{5}, opcode{4}, - opcode{3}, opcode{2}, opcode{1}, - !if(!eq(typeinfo.HasEvenOpcode, 1), 0, opcode{0})}, f, outs, ins, - !strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern> { - +class ITy<bits<8> o, Format f, X86TypeInfo t, dag outs, dag ins, string m, + string args, list<dag> p> + : I<{o{7}, o{6}, o{5}, o{4}, o{3}, o{2}, o{1}, + !if(!eq(t.HasEvenOpcode, 1), 0, o{0})}, f, outs, ins, + !strconcat(m, "{", t.InstrSuffix, "}\t", args), p> { let hasSideEffects = 0; - let hasREX_W = typeinfo.HasREX_W; + let hasREX_W = t.HasREX_W; } +defvar unaryop_args = "$src"; defvar binop_args = "{$src2, $src1|$src1, $src2}"; defvar binop_ndd_args = "{$src2, $src1, $dst|$dst, $src1, $src2}"; From 039d9aa56e9432e119c4b62e575f74fcf3cacb82 Mon Sep 17 00:00:00 2001 From: Jim Lin <jim@andestech.com> Date: Wed, 27 Dec 2023 00:14:50 -0600 Subject: [PATCH 324/342] [RISCV] Remove redundant variable Log2LMUL from vset intrinsic. NFC (#76422) --- clang/include/clang/Basic/riscv_vector.td | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index f2dde7f540fb7..e7d78b03511fe 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -2441,11 +2441,9 @@ let HasMasked = false, HasVL = false, IRName = "" in { return Builder.CreateInsertVector(ResultType, Ops[0], Ops[2], Ops[1]); } }] in { - let Log2LMUL = [0, 1, 2] in { - foreach dst_lmul = ["(LFixedLog2LMUL:1)", "(LFixedLog2LMUL:2)", "(LFixedLog2LMUL:3)"] in { - def : RVVBuiltin<"v" # dst_lmul # "v", dst_lmul # "v" # dst_lmul # "vKzv", "csilxfd">; - def : RVVBuiltin<"Uv" # dst_lmul # "Uv", dst_lmul # "Uv" # dst_lmul #"UvKzUv", "csil">; - } + foreach dst_lmul = ["(LFixedLog2LMUL:1)", "(LFixedLog2LMUL:2)", "(LFixedLog2LMUL:3)"] in { + def : RVVBuiltin<"v" # dst_lmul # "v", dst_lmul # "v" # dst_lmul # "vKzv", "csilxfd">; + def : RVVBuiltin<"Uv" # dst_lmul # "Uv", dst_lmul # "Uv" # dst_lmul #"UvKzUv", "csil">; } foreach nf = NFList in { defvar T = "(Tuple:" # nf # ")"; From b8424eaede5fffc222b1a3afbd1aa4102f6d0b08 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Tue, 26 Dec 2023 23:31:20 -0800 Subject: [PATCH 325/342] [llvm-profdata] Make tests more readable (NFC) These tests generally use one printf for each field of RawInstrProf::ProfileData except the lines being touched in this patch. These lines print two fields, namely NumValueSites and NumBitmapBytes, with one printf, which is very confusing. (Note that the 4-byte printf at the end of the group is padding to make the struct size a multiple of 8 bytes.) This patch makes the tests a litle more readable by splitting NumValueSites and NumBitmapBytes into two separate lines. 
--- llvm/test/tools/llvm-profdata/raw-32-bits-be.test | 6 ++++-- llvm/test/tools/llvm-profdata/raw-32-bits-le.test | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test index fbd31d044382a..8220361df6cfa 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-be.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-be.test @@ -20,7 +20,8 @@ RUN: printf '\3\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\1' >> %t -RUN: printf '\0\0\0\0\0\0\0\3' >> %t +RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\344\023\165\112\031\035\265\067' >> %t @@ -30,7 +31,8 @@ RUN: printf '\2\xff\xff\xd3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\2' >> %t -RUN: printf '\0\0\0\0\0\0\0\1' >> %t +RUN: printf '\0\0\0\0' >> %t +RUN: printf '\0\0\0\1' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0\0\0\0\023' >> %t diff --git a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test index bb899c5fdb555..9352ae132380d 100644 --- a/llvm/test/tools/llvm-profdata/raw-32-bits-le.test +++ b/llvm/test/tools/llvm-profdata/raw-32-bits-le.test @@ -20,7 +20,8 @@ RUN: printf '\0\0\0\3' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\1\0\0\0' >> %t -RUN: printf '\0\0\0\0\3\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t +RUN: printf '\3\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\067\265\035\031\112\165\023\344' >> %t @@ -30,7 +31,8 @@ RUN: printf '\xd3\xff\xff\2' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\2\0\0\0' >> %t -RUN: printf '\0\0\0\0\1\0\0\0' >> %t +RUN: printf '\0\0\0\0' >> %t +RUN: printf '\1\0\0\0' >> %t RUN: printf '\0\0\0\0' >> %t RUN: printf '\023\0\0\0\0\0\0\0' >> %t From 4128edeaadf82245f7ade74900ca3eae0ff94e6a Mon Sep 17 00:00:00 2001 From: Kai Luo <lkail@cn.ibm.com> Date: Wed, 27 Dec 2023 07:29:35 +0000 Subject: [PATCH 326/342] [JITLink] Tell yaml2obj the MachO file is little-endian. NFC. Big-endian is not supported on aarch64 and x86 yet. Try to fix buildbot failure on ppc64be, https://lab.llvm.org/buildbot/#/builders/93/builds/18057. --- .../JITLink/AArch64/MachO_subtractor_single_block.yaml | 1 + .../JITLink/x86-64/MachO_subtractor_single_block.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml index bf72750eece05..e45f2961d0174 100644 --- a/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml +++ b/llvm/test/ExecutionEngine/JITLink/AArch64/MachO_subtractor_single_block.yaml @@ -6,6 +6,7 @@ # section). --- !mach-o +IsLittleEndian: true FileHeader: magic: 0xFEEDFACF cputype: 0x100000C diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml index 12542cf7c3142..05c16d1ad1ca3 100644 --- a/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/MachO_subtractor_single_block.yaml @@ -6,6 +6,7 @@ # section). 
--- !mach-o +IsLittleEndian: true FileHeader: magic: 0xFEEDFACF cputype: 0x1000007 From 4972a19702a00ce0a66d66d38b982c706a008ec8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Tue, 26 Dec 2023 23:56:21 -0800 Subject: [PATCH 327/342] [wasm] Use StringRef::trim (NFC) --- lld/wasm/InputFiles.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp index 96ac1e1610dd3..5709a5ced584c 100644 --- a/lld/wasm/InputFiles.cpp +++ b/lld/wasm/InputFiles.cpp @@ -680,16 +680,7 @@ Symbol *ObjFile::createUndefined(const WasmSymbol &sym, bool isCalledDirectly) { llvm_unreachable("unknown symbol kind"); } - -StringRef strip(StringRef s) { - while (s.starts_with(" ")) { - s = s.drop_front(); - } - while (s.ends_with(" ")) { - s = s.drop_back(); - } - return s; -} +StringRef strip(StringRef s) { return s.trim(' '); } void StubFile::parse() { bool first = true; From 72390c5c56c5f6a2c2d55231d96f03471eee8cb4 Mon Sep 17 00:00:00 2001 From: Congcong Cai <congcongcai0907@163.com> Date: Wed, 27 Dec 2023 16:16:57 +0800 Subject: [PATCH 328/342] [clang-tidy][misleading-indentation]ignore false-positives for line started with empty macro (#75061) Fixes: #71767 --- .../MisleadingIndentationCheck.cpp | 22 ++++++++++++++++--- .../readability/MisleadingIndentationCheck.h | 3 ++- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++++ .../readability/misleading-indentation.cpp | 9 ++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp index 2c011f5c0e690..e32f79589a059 100644 --- a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "MisleadingIndentationCheck.h" +#include "../utils/LexerUtils.h" #include "clang/AST/ASTContext.h" #include "clang/ASTMatchers/ASTMatchFinder.h" @@ -51,8 +52,20 @@ void MisleadingIndentationCheck::danglingElseCheck(const SourceManager &SM, diag(ElseLoc, "different indentation for 'if' and corresponding 'else'"); } -void MisleadingIndentationCheck::missingBracesCheck(const SourceManager &SM, - const CompoundStmt *CStmt) { +static bool isAtStartOfLineIncludingEmptyMacro(SourceLocation NextLoc, + const SourceManager &SM, + const LangOptions &LangOpts) { + const SourceLocation BeforeLoc = + utils::lexer::getPreviousTokenAndStart(NextLoc, SM, LangOpts).second; + if (BeforeLoc.isInvalid()) + return false; + return SM.getExpansionLineNumber(BeforeLoc) != + SM.getExpansionLineNumber(NextLoc); +} + +void MisleadingIndentationCheck::missingBracesCheck( + const SourceManager &SM, const CompoundStmt *CStmt, + const LangOptions &LangOpts) { const static StringRef StmtNames[] = {"if", "for", "while"}; for (unsigned int I = 0; I < CStmt->size() - 1; I++) { const Stmt *CurrentStmt = CStmt->body_begin()[I]; @@ -92,6 +105,8 @@ void MisleadingIndentationCheck::missingBracesCheck(const SourceManager &SM, if (NextLoc.isInvalid() || NextLoc.isMacroID()) continue; + if (!isAtStartOfLineIncludingEmptyMacro(NextLoc, SM, LangOpts)) + continue; if (SM.getExpansionColumnNumber(InnerLoc) == SM.getExpansionColumnNumber(NextLoc)) { @@ -117,7 +132,8 @@ void MisleadingIndentationCheck::check(const MatchFinder::MatchResult &Result) { danglingElseCheck(*Result.SourceManager, 
Result.Context, If); if (const auto *CStmt = Result.Nodes.getNodeAs<CompoundStmt>("compound")) - missingBracesCheck(*Result.SourceManager, CStmt); + missingBracesCheck(*Result.SourceManager, CStmt, + Result.Context->getLangOpts()); } } // namespace clang::tidy::readability diff --git a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h index c336abbc7c4a9..9c92fc1e18b6f 100644 --- a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h @@ -32,7 +32,8 @@ class MisleadingIndentationCheck : public ClangTidyCheck { private: void danglingElseCheck(const SourceManager &SM, ASTContext *Context, const IfStmt *If); - void missingBracesCheck(const SourceManager &SM, const CompoundStmt *CStmt); + void missingBracesCheck(const SourceManager &SM, const CompoundStmt *CStmt, + const LangOptions &LangOpts); }; } // namespace clang::tidy::readability diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index fe7f40d95fe6c..571808a51596a 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -465,6 +465,10 @@ Changes in existing checks `AllowPointerConditions` options. It also now provides more consistent suggestions when parentheses are added to the return value. +- Improved :doc:`readability-misleading-indentation + <clang-tidy/checks/readability/misleading-indentation>` check to ignore + false-positives for line started with empty macro. + - Improved :doc:`readability-non-const-parameter <clang-tidy/checks/readability/non-const-parameter>` check to ignore false-positives in initializer list of record. diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/misleading-indentation.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/misleading-indentation.cpp index aea0618d120db..5d4d60f5f1a35 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/misleading-indentation.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/misleading-indentation.cpp @@ -4,6 +4,8 @@ void foo1(); void foo2(); void foo3(); +#define E + #define BLOCK \ if (cond1) \ foo1(); \ @@ -109,6 +111,13 @@ void f() } BLOCK + + if (cond1) + foo1(); + else + foo2(); + E foo3(); + // CHECK-MESSAGES-NOT: :[[@LINE-1]]readability-misleading-indentation } void g(bool x) { From 2df0fa41a3aa416d84f9f22490f329bf851d447e Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Wed, 27 Dec 2023 00:19:12 -0800 Subject: [PATCH 329/342] [clang] Use StringRef::consume_front (NFC) --- clang/lib/Basic/Warnings.cpp | 6 +----- clang/lib/Driver/ToolChains/Arch/X86.cpp | 4 +--- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/clang/lib/Basic/Warnings.cpp b/clang/lib/Basic/Warnings.cpp index bab1af4f03b67..92954cab6fb04 100644 --- a/clang/lib/Basic/Warnings.cpp +++ b/clang/lib/Basic/Warnings.cpp @@ -96,11 +96,7 @@ void clang::ProcessWarningOptions(DiagnosticsEngine &Diags, // Check to see if this warning starts with "no-", if so, this is a // negative form of the option. - bool isPositive = true; - if (Opt.starts_with("no-")) { - isPositive = false; - Opt = Opt.substr(3); - } + bool isPositive = !Opt.consume_front("no-"); // Figure out how this option affects the warning. If -Wfoo, map the // diagnostic to a warning, if -Wno-foo, map it to ignore. 
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp index fef0522aaf45b..53e26a9f8e229 100644 --- a/clang/lib/Driver/ToolChains/Arch/X86.cpp +++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp @@ -237,9 +237,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple, assert(Name.starts_with("m") && "Invalid feature name."); Name = Name.substr(1); - bool IsNegative = Name.starts_with("no-"); - if (IsNegative) - Name = Name.substr(3); + bool IsNegative = Name.consume_front("no-"); #ifndef NDEBUG assert(Name.starts_with("avx10.") && "Invalid AVX10 feature name."); From ae0b2633c935950084860e5f6a1c2c3203726489 Mon Sep 17 00:00:00 2001 From: DavidKorczynski <david@adalogics.com> Date: Wed, 27 Dec 2023 08:26:21 +0000 Subject: [PATCH 330/342] [BitcodeReader] Add bounds checking on Strtab (#76403) This is needed to protect against global overflows, which was found by a fuzzer recently. Fixes: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=65283 --------- Signed-off-by: David Korczynski <david@adalogics.com> --- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 8907f6fa4ff3f..a027d0c21ba0b 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -4218,6 +4218,9 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord( // Check whether we have enough values to read a partition name. if (OpNum + 1 < Record.size()) { + // Check Strtab has enough values for the partition. + if (Record[OpNum] + Record[OpNum + 1] > Strtab.size()) + return error("Malformed partition, too large."); NewGA->setPartition( StringRef(Strtab.data() + Record[OpNum], Record[OpNum + 1])); OpNum += 2; From 9f6bf00b258e4dae42c6bf3143714a3bef808da7 Mon Sep 17 00:00:00 2001 From: Shao-Ce SUN <sunshaoce@outlook.com> Date: Wed, 27 Dec 2023 17:20:54 +0800 Subject: [PATCH 331/342] [DAGCombine] Add DAG optimisation for BF16_TO_FP (#69426) fold bf16_to_fp(op & 0xffff) -> bf16_to_fp(op) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 +++++-- llvm/test/CodeGen/RISCV/bfloat-convert.ll | 42 ------------------- llvm/test/CodeGen/RISCV/bfloat.ll | 32 ++------------ 3 files changed, 17 insertions(+), 73 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0d46c7868d87e..eafa95ce7fcf7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -546,6 +546,7 @@ namespace { SDValue visitFP_TO_FP16(SDNode *N); SDValue visitFP16_TO_FP(SDNode *N); SDValue visitFP_TO_BF16(SDNode *N); + SDValue visitBF16_TO_FP(SDNode *N); SDValue visitVECREDUCE(SDNode *N); SDValue visitVPOp(SDNode *N); SDValue visitGET_FPENV_MEM(SDNode *N); @@ -2047,6 +2048,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); case ISD::FP_TO_BF16: return visitFP_TO_BF16(N); + case ISD::BF16_TO_FP: return visitBF16_TO_FP(N); case ISD::FREEZE: return visitFREEZE(N); case ISD::GET_FPENV_MEM: return visitGET_FPENV_MEM(N); case ISD::SET_FPENV_MEM: return visitSET_FPENV_MEM(N); @@ -26256,14 +26258,17 @@ SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) { } SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) { + auto Op = N->getOpcode(); + assert((Op == ISD::FP16_TO_FP || Op == ISD::BF16_TO_FP) && + "opcode should be FP16_TO_FP or 
BF16_TO_FP."); SDValue N0 = N->getOperand(0); - // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op) + // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op) or + // fold bf16_to_fp(op & 0xffff) -> bf16_to_fp(op) if (!TLI.shouldKeepZExtForFP16Conv() && N0->getOpcode() == ISD::AND) { ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1)); if (AndConst && AndConst->getAPIntValue() == 0xffff) { - return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), - N0.getOperand(0)); + return DAG.getNode(Op, SDLoc(N), N->getValueType(0), N0.getOperand(0)); } } @@ -26280,6 +26285,11 @@ SDValue DAGCombiner::visitFP_TO_BF16(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitBF16_TO_FP(SDNode *N) { + // fold bf16_to_fp(op & 0xffff) -> bf16_to_fp(op) + return visitFP16_TO_FP(N); +} + SDValue DAGCombiner::visitVECREDUCE(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N0.getValueType(); diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll index 8a0c4240d161b..bfa2c3bb4a8ba 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll @@ -39,8 +39,6 @@ define i16 @fcvt_si_bf16(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_si_bf16: ; RV64ID: # %bb.0: ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.l.s a0, fa5, rtz @@ -100,8 +98,6 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_si_bf16_sat: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: feq.s a0, fa5, fa5 @@ -145,8 +141,6 @@ define i16 @fcvt_ui_bf16(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_ui_bf16: ; RV64ID: # %bb.0: ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz @@ -196,8 +190,6 @@ define i16 @fcvt_ui_bf16_sat(bfloat %a) nounwind { ; RV64ID-NEXT: lui a0, %hi(.LCPI3_0) ; RV64ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0) ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa4, a0 ; RV64ID-NEXT: fmv.w.x fa3, zero @@ -235,8 +227,6 @@ define i32 @fcvt_w_bf16(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_w_bf16: ; RV64ID: # %bb.0: ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.l.s a0, fa5, rtz @@ -281,8 +271,6 @@ define i32 @fcvt_w_bf16_sat(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_w_bf16_sat: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.w.s a0, fa5, rtz @@ -321,8 +309,6 @@ define i32 @fcvt_wu_bf16(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_wu_bf16: ; RV64ID: # %bb.0: ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz @@ -361,8 +347,6 @@ define i32 @fcvt_wu_bf16_multiple_use(bfloat %x, ptr %y) nounwind { ; RV64ID-LABEL: fcvt_wu_bf16_multiple_use: ; RV64ID: # %bb.0: ; RV64ID-NEXT: fmv.x.w a0, fa0 -; 
RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz @@ -413,8 +397,6 @@ define i32 @fcvt_wu_bf16_sat(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_wu_bf16_sat: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.wu.s a0, fa5, rtz @@ -463,8 +445,6 @@ define i64 @fcvt_l_bf16(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_l_bf16: ; RV64ID: # %bb.0: ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.l.s a0, fa5, rtz @@ -606,8 +586,6 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_l_bf16_sat: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.l.s a0, fa5, rtz @@ -654,8 +632,6 @@ define i64 @fcvt_lu_bf16(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_lu_bf16: ; RV64ID: # %bb.0: ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz @@ -730,8 +706,6 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_lu_bf16_sat: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz @@ -1200,8 +1174,6 @@ define float @fcvt_s_bf16(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_s_bf16: ; RV64ID: # %bb.0: ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa0, a0 ; RV64ID-NEXT: ret @@ -1313,8 +1285,6 @@ define double @fcvt_d_bf16(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_d_bf16: ; RV64ID: # %bb.0: ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.d.s fa0, fa5 @@ -1521,8 +1491,6 @@ define signext i8 @fcvt_w_s_i8(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_w_s_i8: ; RV64ID: # %bb.0: ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.l.s a0, fa5, rtz @@ -1582,8 +1550,6 @@ define signext i8 @fcvt_w_s_sat_i8(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_w_s_sat_i8: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: feq.s a0, fa5, fa5 @@ -1627,8 +1593,6 @@ define zeroext i8 @fcvt_wu_s_i8(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_wu_s_i8: ; RV64ID: # %bb.0: ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz @@ -1676,8 +1640,6 @@ define zeroext i8 @fcvt_wu_s_sat_i8(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_wu_s_sat_i8: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: 
srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fmv.w.x fa4, zero @@ -1731,8 +1693,6 @@ define zeroext i32 @fcvt_wu_bf16_sat_zext(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_wu_bf16_sat_zext: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.wu.s a0, fa5, rtz @@ -1784,8 +1744,6 @@ define signext i32 @fcvt_w_bf16_sat_sext(bfloat %a) nounwind { ; RV64ID-LABEL: fcvt_w_bf16_sat_sext: ; RV64ID: # %bb.0: # %start ; RV64ID-NEXT: fmv.x.w a0, fa0 -; RV64ID-NEXT: slli a0, a0, 48 -; RV64ID-NEXT: srli a0, a0, 48 ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa5, a0 ; RV64ID-NEXT: fcvt.w.s a0, fa5, rtz diff --git a/llvm/test/CodeGen/RISCV/bfloat.ll b/llvm/test/CodeGen/RISCV/bfloat.ll index 5013f76f9b0b3..d62f35388123f 100644 --- a/llvm/test/CodeGen/RISCV/bfloat.ll +++ b/llvm/test/CodeGen/RISCV/bfloat.ll @@ -164,8 +164,6 @@ define float @bfloat_to_float(bfloat %a) nounwind { ; ; RV64ID-LP64-LABEL: bfloat_to_float: ; RV64ID-LP64: # %bb.0: -; RV64ID-LP64-NEXT: slli a0, a0, 48 -; RV64ID-LP64-NEXT: srli a0, a0, 48 ; RV64ID-LP64-NEXT: slli a0, a0, 16 ; RV64ID-LP64-NEXT: ret ; @@ -179,8 +177,6 @@ define float @bfloat_to_float(bfloat %a) nounwind { ; RV64ID-LP64D-LABEL: bfloat_to_float: ; RV64ID-LP64D: # %bb.0: ; RV64ID-LP64D-NEXT: fmv.x.w a0, fa0 -; RV64ID-LP64D-NEXT: slli a0, a0, 48 -; RV64ID-LP64D-NEXT: srli a0, a0, 48 ; RV64ID-LP64D-NEXT: slli a0, a0, 16 ; RV64ID-LP64D-NEXT: fmv.w.x fa0, a0 ; RV64ID-LP64D-NEXT: ret @@ -223,8 +219,6 @@ define double @bfloat_to_double(bfloat %a) nounwind { ; ; RV64ID-LP64-LABEL: bfloat_to_double: ; RV64ID-LP64: # %bb.0: -; RV64ID-LP64-NEXT: slli a0, a0, 48 -; RV64ID-LP64-NEXT: srli a0, a0, 48 ; RV64ID-LP64-NEXT: slli a0, a0, 16 ; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 ; RV64ID-LP64-NEXT: fcvt.d.s fa5, fa5 @@ -242,8 +236,6 @@ define double @bfloat_to_double(bfloat %a) nounwind { ; RV64ID-LP64D-LABEL: bfloat_to_double: ; RV64ID-LP64D: # %bb.0: ; RV64ID-LP64D-NEXT: fmv.x.w a0, fa0 -; RV64ID-LP64D-NEXT: slli a0, a0, 48 -; RV64ID-LP64D-NEXT: srli a0, a0, 48 ; RV64ID-LP64D-NEXT: slli a0, a0, 16 ; RV64ID-LP64D-NEXT: fmv.w.x fa5, a0 ; RV64ID-LP64D-NEXT: fcvt.d.s fa0, fa5 @@ -366,10 +358,6 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind { ; RV64ID-LP64: # %bb.0: ; RV64ID-LP64-NEXT: addi sp, sp, -16 ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64ID-LP64-NEXT: lui a2, 16 -; RV64ID-LP64-NEXT: addi a2, a2, -1 -; RV64ID-LP64-NEXT: and a0, a0, a2 -; RV64ID-LP64-NEXT: and a1, a1, a2 ; RV64ID-LP64-NEXT: slli a1, a1, 16 ; RV64ID-LP64-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64-NEXT: slli a0, a0, 16 @@ -408,11 +396,7 @@ define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind { ; RV64ID-LP64D-NEXT: addi sp, sp, -16 ; RV64ID-LP64D-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64D-NEXT: fmv.x.w a0, fa0 -; RV64ID-LP64D-NEXT: lui a1, 16 -; RV64ID-LP64D-NEXT: addi a1, a1, -1 -; RV64ID-LP64D-NEXT: and a0, a0, a1 -; RV64ID-LP64D-NEXT: fmv.x.w a2, fa1 -; RV64ID-LP64D-NEXT: and a1, a2, a1 +; RV64ID-LP64D-NEXT: fmv.x.w a1, fa1 ; RV64ID-LP64D-NEXT: slli a1, a1, 16 ; RV64ID-LP64D-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64D-NEXT: slli a0, a0, 16 @@ -604,12 +588,8 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind { ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: mv s0, a0 -; RV64ID-LP64-NEXT: 
lui a0, 16 -; RV64ID-LP64-NEXT: addi a0, a0, -1 -; RV64ID-LP64-NEXT: and a1, a1, a0 -; RV64ID-LP64-NEXT: and a0, a2, a0 -; RV64ID-LP64-NEXT: slli a0, a0, 16 -; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 +; RV64ID-LP64-NEXT: slli a2, a2, 16 +; RV64ID-LP64-NEXT: fmv.w.x fa5, a2 ; RV64ID-LP64-NEXT: slli a1, a1, 16 ; RV64ID-LP64-NEXT: fmv.w.x fa4, a1 ; RV64ID-LP64-NEXT: fadd.s fa5, fa4, fa5 @@ -651,11 +631,7 @@ define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind { ; RV64ID-LP64D-NEXT: sd s0, 0(sp) # 8-byte Folded Spill ; RV64ID-LP64D-NEXT: mv s0, a0 ; RV64ID-LP64D-NEXT: fmv.x.w a0, fa0 -; RV64ID-LP64D-NEXT: lui a1, 16 -; RV64ID-LP64D-NEXT: addi a1, a1, -1 -; RV64ID-LP64D-NEXT: and a0, a0, a1 -; RV64ID-LP64D-NEXT: fmv.x.w a2, fa1 -; RV64ID-LP64D-NEXT: and a1, a2, a1 +; RV64ID-LP64D-NEXT: fmv.x.w a1, fa1 ; RV64ID-LP64D-NEXT: slli a1, a1, 16 ; RV64ID-LP64D-NEXT: fmv.w.x fa5, a1 ; RV64ID-LP64D-NEXT: slli a0, a0, 16 From 38c9390b59c4d2b9181614d6a909887497d3692f Mon Sep 17 00:00:00 2001 From: David Green <david.green@arm.com> Date: Wed, 27 Dec 2023 10:40:46 +0000 Subject: [PATCH 332/342] [AArch64] Add an extra test for #75822. NFC --- .../AArch64/neon-compare-instructions.ll | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll index e43fcef30b00e..b2fc477d8655a 100644 --- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll +++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll @@ -1789,6 +1789,26 @@ define <8 x i1> @not_cmle8xi8(<8 x i8> %0) { ret <8 x i1> %cmp.i } +define <4 x i1> @not_cmle16xi8(<4 x i32> %0) { +; CHECK-SD-LABEL: not_cmle16xi8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmle v0.4s, v0.4s, #0 +; CHECK-SD-NEXT: xtn v0.4h, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: not_cmle16xi8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI134_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI134_0] +; CHECK-GI-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: ret +entry: + %bc = bitcast <16 x i8> <i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0, i8 1, i8 0> to <4 x i32> + %cmp.i = icmp slt <4 x i32> %0, %bc + ret <4 x i1> %cmp.i +} + define <8 x i8> @cmltz8xi8_alt(<8 x i8> %A) { ; CHECK-SD-LABEL: cmltz8xi8_alt: ; CHECK-SD: // %bb.0: @@ -2082,8 +2102,8 @@ define <2 x i64> @cmhsz2xi64(<2 x i64> %A) { ; ; CHECK-GI-LABEL: cmhsz2xi64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI154_0 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI154_0] +; CHECK-GI-NEXT: adrp x8, .LCPI155_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI155_0] ; CHECK-GI-NEXT: cmhs v0.2d, v0.2d, v1.2d ; CHECK-GI-NEXT: ret %tmp3 = icmp uge <2 x i64> %A, <i64 2, i64 2> @@ -2168,8 +2188,8 @@ define <2 x i64> @cmhiz2xi64(<2 x i64> %A) { ; ; CHECK-GI-LABEL: cmhiz2xi64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI161_0 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI161_0] +; CHECK-GI-NEXT: adrp x8, .LCPI162_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI162_0] ; CHECK-GI-NEXT: cmhi v0.2d, v0.2d, v1.2d ; CHECK-GI-NEXT: ret %tmp3 = icmp ugt <2 x i64> %A, <i64 1, i64 1> @@ -2344,8 +2364,8 @@ define <2 x i64> @cmloz2xi64(<2 x i64> %A) { ; ; CHECK-GI-LABEL: cmloz2xi64: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: adrp x8, .LCPI175_0 -; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI175_0] +; CHECK-GI-NEXT: adrp x8, .LCPI176_0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI176_0] ; CHECK-GI-NEXT: cmhi v0.2d, 
v1.2d, v0.2d ; CHECK-GI-NEXT: ret %tmp3 = icmp ult <2 x i64> %A, <i64 2, i64 2> From 1150e8ef7765f43a730575bd224eda18e916ac1e Mon Sep 17 00:00:00 2001 From: Xiang Li <python3kgae@outlook.com> Date: Wed, 27 Dec 2023 07:32:21 -0800 Subject: [PATCH 333/342] [mlir::spirv] Support scf.if in mlir-vulkan-runner (#75367) 1. Register SCFDialect in mlir-vulkan-runner 2. Add SCFToSPIRV in GPUToSPIRVPass to lower scf. Fixes https://github.com/llvm/llvm-project/issues/74939 --- .../Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp | 3 ++ mlir/test/mlir-vulkan-runner/addf_if.mlir | 54 +++++++++++++++++++ .../mlir-vulkan-runner/mlir-vulkan-runner.cpp | 5 +- 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 mlir/test/mlir-vulkan-runner/addf_if.mlir diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp index ae89774239b58..8279b3408a6e6 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp @@ -17,6 +17,7 @@ #include "mlir/Conversion/FuncToSPIRV/FuncToSPIRV.h" #include "mlir/Conversion/GPUToSPIRV/GPUToSPIRV.h" #include "mlir/Conversion/MemRefToSPIRV/MemRefToSPIRV.h" +#include "mlir/Conversion/SCFToSPIRV/SCFToSPIRV.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" @@ -126,6 +127,8 @@ void GPUToSPIRVPass::runOnOperation() { // TODO: Change SPIR-V conversion to be progressive and remove the following // patterns. + ScfToSPIRVContext scfContext; + populateSCFToSPIRVPatterns(typeConverter, scfContext, patterns); mlir::arith::populateArithToSPIRVPatterns(typeConverter, patterns); populateMemRefToSPIRVPatterns(typeConverter, patterns); populateFuncToSPIRVPatterns(typeConverter, patterns); diff --git a/mlir/test/mlir-vulkan-runner/addf_if.mlir b/mlir/test/mlir-vulkan-runner/addf_if.mlir new file mode 100644 index 0000000000000..fbd1fae6d0b59 --- /dev/null +++ b/mlir/test/mlir-vulkan-runner/addf_if.mlir @@ -0,0 +1,54 @@ +// RUN: mlir-vulkan-runner %s --shared-libs=%vulkan-runtime-wrappers,%mlir_runner_utils --entry-point-result=void | FileCheck %s + +// CHECK: [3.3, 3.3, 3.3, 3.3, 0, 0, 0, 0] +module attributes { + gpu.container_module, + spirv.target_env = #spirv.target_env< + #spirv.vce<v1.0, [Shader], [SPV_KHR_storage_buffer_storage_class]>, #spirv.resource_limits<>> +} { + gpu.module @kernels { + gpu.func @kernel_add(%arg0 : memref<8xf32>, %arg1 : memref<8xf32>, %arg2 : memref<8xf32>) + kernel attributes { spirv.entry_point_abi = #spirv.entry_point_abi<workgroup_size = [1, 1, 1]>} { + %0 = gpu.block_id x + %limit = arith.constant 4 : index + %cond = arith.cmpi slt, %0, %limit : index + scf.if %cond { + %1 = memref.load %arg0[%0] : memref<8xf32> + %2 = memref.load %arg1[%0] : memref<8xf32> + %3 = arith.addf %1, %2 : f32 + memref.store %3, %arg2[%0] : memref<8xf32> + } + gpu.return + } + } + + func.func @main() { + %arg0 = memref.alloc() : memref<8xf32> + %arg1 = memref.alloc() : memref<8xf32> + %arg2 = memref.alloc() : memref<8xf32> + %0 = arith.constant 0 : i32 + %1 = arith.constant 1 : i32 + %2 = arith.constant 2 : i32 + %value0 = arith.constant 0.0 : f32 + %value1 = arith.constant 1.1 : f32 + %value2 = arith.constant 2.2 : f32 + %arg3 = memref.cast %arg0 : memref<8xf32> to memref<?xf32> + %arg4 = memref.cast %arg1 : memref<8xf32> to memref<?xf32> + %arg5 = memref.cast %arg2 : memref<8xf32> to memref<?xf32> + call @fillResource1DFloat(%arg3, %value1) : (memref<?xf32>, f32) -> () + call 
@fillResource1DFloat(%arg4, %value2) : (memref<?xf32>, f32) -> () + call @fillResource1DFloat(%arg5, %value0) : (memref<?xf32>, f32) -> () + + %cst1 = arith.constant 1 : index + %cst8 = arith.constant 8 : index + gpu.launch_func @kernels::@kernel_add + blocks in (%cst8, %cst1, %cst1) threads in (%cst1, %cst1, %cst1) + args(%arg0 : memref<8xf32>, %arg1 : memref<8xf32>, %arg2 : memref<8xf32>) + %arg6 = memref.cast %arg5 : memref<?xf32> to memref<*xf32> + call @printMemrefF32(%arg6) : (memref<*xf32>) -> () + return + } + func.func private @fillResource1DFloat(%0 : memref<?xf32>, %1 : f32) + func.func private @printMemrefF32(%ptr : memref<*xf32>) +} + diff --git a/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp b/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp index 5b8e236b4618f..032f5760361f4 100644 --- a/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp +++ b/mlir/tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp @@ -27,6 +27,7 @@ #include "mlir/Dialect/LLVMIR/Transforms/RequestCWrappers.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/MemRef/Transforms/Passes.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" #include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" #include "mlir/Dialect/SPIRV/Transforms/Passes.h" @@ -105,8 +106,8 @@ int main(int argc, char **argv) { mlir::DialectRegistry registry; registry.insert<mlir::arith::ArithDialect, mlir::LLVM::LLVMDialect, mlir::gpu::GPUDialect, mlir::spirv::SPIRVDialect, - mlir::func::FuncDialect, mlir::memref::MemRefDialect, - mlir::vector::VectorDialect>(); + mlir::scf::SCFDialect, mlir::func::FuncDialect, + mlir::memref::MemRefDialect, mlir::vector::VectorDialect>(); mlir::registerBuiltinDialectTranslation(registry); mlir::registerLLVMDialectTranslation(registry); From 8cf6bcf5a30673dd8a234ae3ef4ab4c1e63786b1 Mon Sep 17 00:00:00 2001 From: gitoleg <forown@yandex.ru> Date: Wed, 27 Dec 2023 19:08:35 +0300 Subject: [PATCH 334/342] [mlir][llvm] Add assert in CallOp builder (#76240) This commit adds an assert in one of the CallOp builders to ensure it is not use to create an indirect call. Otherwise, the callee type would include the callee pointer type which is handed in as first argument. --- mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 458bf83eac17f..64388a9a01812 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -908,6 +908,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results, void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results, FlatSymbolRefAttr callee, ValueRange args) { + assert(callee && "expected non-null callee in direct call builder"); build(builder, state, results, TypeAttr::get(getLLVMFuncType(builder.getContext(), results, args)), callee, args, /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, From fe94ae83ecf356fd2d63b7cd15f37aaafdbc14ef Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr <jplehr@users.noreply.github.com> Date: Wed, 27 Dec 2023 17:27:59 +0100 Subject: [PATCH 335/342] [libc][FIXME] Disable another test on GPU (#76444) This test fails on some internal buildbot machines / setups. Disable for now and to fix later. 
--- libc/test/src/__support/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index fe8b3c4c84c38..a92e6da56096a 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -42,6 +42,8 @@ add_libc_test( DEPENDS libc.src.__support.high_precision_decimal libc.src.__support.uint128 + # FIXME Test segfaults on gfx90a GPU + UNIT_TEST_ONLY ) add_libc_test( From a70dcc2cda6b60c2dbd4b96d229b4c1bf43c23d1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata <kazu@google.com> Date: Wed, 27 Dec 2023 09:10:39 -0800 Subject: [PATCH 336/342] [clang] Use StringRef::ltrim (NFC) --- clang/lib/ASTMatchers/Dynamic/Parser.cpp | 6 ++---- clang/lib/Basic/IdentifierTable.cpp | 3 +-- clang/lib/Basic/Targets/AArch64.cpp | 3 +-- clang/lib/Basic/Targets/ARM.cpp | 3 +-- clang/lib/Basic/Targets/X86.cpp | 3 +-- 5 files changed, 6 insertions(+), 12 deletions(-) diff --git a/clang/lib/ASTMatchers/Dynamic/Parser.cpp b/clang/lib/ASTMatchers/Dynamic/Parser.cpp index 27096a83b8dd6..6a16c2184fcfb 100644 --- a/clang/lib/ASTMatchers/Dynamic/Parser.cpp +++ b/clang/lib/ASTMatchers/Dynamic/Parser.cpp @@ -299,10 +299,8 @@ class Parser::CodeTokenizer { /// Consume all leading whitespace from \c Code. void consumeWhitespace() { - Code = Code.drop_while([](char c) { - // Don't trim newlines. - return StringRef(" \t\v\f\r").contains(c); - }); + // Don't trim newlines. + Code = Code.ltrim(" \t\v\f\r"); } SourceLocation currentLocation() { diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp index 5902c6dc3ce0b..d0d8316385b45 100644 --- a/clang/lib/Basic/IdentifierTable.cpp +++ b/clang/lib/Basic/IdentifierTable.cpp @@ -628,8 +628,7 @@ ObjCMethodFamily Selector::getMethodFamilyImpl(Selector sel) { return OMF_performSelector; // The other method families may begin with a prefix of underscores. - while (!name.empty() && name.front() == '_') - name = name.substr(1); + name = name.ltrim('_'); if (name.empty()) return OMF_None; switch (name.front()) { diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 3ee39133fcee7..2f8395cb8932f 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -1365,8 +1365,7 @@ bool AArch64TargetInfo::validateConstraintModifier( StringRef Constraint, char Modifier, unsigned Size, std::string &SuggestedModifier) const { // Strip off constraint modifiers. - while (Constraint[0] == '=' || Constraint[0] == '+' || Constraint[0] == '&') - Constraint = Constraint.substr(1); + Constraint = Constraint.ltrim("=+&"); switch (Constraint[0]) { default: diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 6e1842fc64e50..01f9e844da12a 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -1230,8 +1230,7 @@ bool ARMTargetInfo::validateConstraintModifier( bool isInOut = (Constraint[0] == '+'); // Strip off constraint modifiers. 
- while (Constraint[0] == '=' || Constraint[0] == '+' || Constraint[0] == '&') - Constraint = Constraint.substr(1); + Constraint = Constraint.ltrim("=+&"); switch (Constraint[0]) { default: diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index b97f88647fa49..3deaa19f8d4fc 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -1613,8 +1613,7 @@ bool X86TargetInfo::validateOutputSize(const llvm::StringMap<bool> &FeatureMap, StringRef Constraint, unsigned Size) const { // Strip off constraint modifiers. - while (Constraint[0] == '=' || Constraint[0] == '+' || Constraint[0] == '&') - Constraint = Constraint.substr(1); + Constraint = Constraint.ltrim("=+&"); return validateOperandSize(FeatureMap, Constraint, Size); } From 410066a0fad14a390dbdb883ba4b3e018fe62582 Mon Sep 17 00:00:00 2001 From: "Balaji V. Iyer" <43187390+bviyer@users.noreply.github.com> Date: Wed, 27 Dec 2023 11:17:48 -0600 Subject: [PATCH 337/342] [Bazel] Added SCFDialect to mlir-vulkan-runner (#76454) Added SCFDialect to mlir-vulkan-runner cc_binary. --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index c80d714f6a991..9e730c33db2de 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -9360,6 +9360,7 @@ cc_binary( ":MlirJitRunner", ":Pass", ":ReconcileUnrealizedCasts", + ":SCFDialect", ":SPIRVDialect", ":SPIRVTransforms", ":ToLLVMIRTranslation", From cdb7d8adbb2dd59076c3b688493bc9506083861b Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Wed, 27 Dec 2023 09:45:18 -0800 Subject: [PATCH 338/342] [RISCV] Minor improvements to RISCVInstrInfoXSf.td. NFC (#76424) Use an explicit list of LMULInfos instead of indexing part of other lists. Use wvrclass field to double LMUL instead of using two lists. Use range instead of list in another spot. I wish I could use a list of LMULInfo here but there's no way to quadruple an LMUL yet. 
--- llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 38d05877bb45c..561ab8d7403d6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -356,10 +356,9 @@ multiclass VPseudoSiFiveVQMACCDOD<string Constraint = ""> { } multiclass VPseudoSiFiveVQMACCQOQ<string Constraint = ""> { - foreach i = 0-3 in - let VLMul = MxListVF4[i].value in - defm NAME : VPseudoSiFiveVMACC<MxListVF4[i].MX, MxListVF8[i].vrclass, - MxListVF4[i].vrclass, Constraint>; + foreach m = [V_MF2, V_M1, V_M2, V_M4] in + let VLMul = m.value in + defm NAME : VPseudoSiFiveVMACC<m.MX, m.wvrclass, m.vrclass, Constraint>; } multiclass VPseudoSiFiveVFWMACC<string Constraint = ""> { @@ -369,7 +368,7 @@ multiclass VPseudoSiFiveVFWMACC<string Constraint = ""> { } multiclass VPseudoSiFiveVFNRCLIP<string Constraint = "@earlyclobber $rd"> { - foreach i = [0, 1, 2, 3, 4] in + foreach i = 0-4 in let hasSideEffects = 0 in defm "Pseudo" # NAME : VPseudoBinaryRoundingMode<MxListW[i].vrclass, MxListVF4[i].vrclass, From 840e23a7d50472d0b51bd31364c899c5044c7a3d Mon Sep 17 00:00:00 2001 From: Craig Topper <craig.topper@sifive.com> Date: Wed, 27 Dec 2023 09:45:50 -0800 Subject: [PATCH 339/342] [RISCV] Simplify VTypeInfo and GroupVTypeInfo tblgen templates. NFC (#76427) We don't need to pass in the register class, we can get it from the LMULInfo. --- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 94 +++++++++---------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 488ffa73f4e48..be4bc3b58766e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -257,13 +257,13 @@ class SegRegClass<LMULInfo m, int nf> { // Vector register and vector group type information. 
//===----------------------------------------------------------------------===// -class VTypeInfo<ValueType Vec, ValueType Mas, int Sew, VReg Reg, LMULInfo M, +class VTypeInfo<ValueType Vec, ValueType Mas, int Sew, LMULInfo M, ValueType Scal = XLenVT, RegisterClass ScalarReg = GPR> { ValueType Vector = Vec; ValueType Mask = Mas; int SEW = Sew; int Log2SEW = !logtwo(Sew); - VReg RegClass = Reg; + VReg RegClass = M.vrclass; LMULInfo LMul = M; ValueType Scalar = Scal; RegisterClass ScalarRegClass = ScalarReg; @@ -279,9 +279,9 @@ class VTypeInfo<ValueType Vec, ValueType Mas, int Sew, VReg Reg, LMULInfo M, } class GroupVTypeInfo<ValueType Vec, ValueType VecM1, ValueType Mas, int Sew, - VReg Reg, LMULInfo M, ValueType Scal = XLenVT, + LMULInfo M, ValueType Scal = XLenVT, RegisterClass ScalarReg = GPR> - : VTypeInfo<Vec, Mas, Sew, Reg, M, Scal, ScalarReg> { + : VTypeInfo<Vec, Mas, Sew, M, Scal, ScalarReg> { ValueType VectorM1 = VecM1; } @@ -289,70 +289,70 @@ defset list<VTypeInfo> AllVectors = { defset list<VTypeInfo> AllIntegerVectors = { defset list<VTypeInfo> NoGroupIntegerVectors = { defset list<VTypeInfo> FractionalGroupIntegerVectors = { - def VI8MF8: VTypeInfo<vint8mf8_t, vbool64_t, 8, VR, V_MF8>; - def VI8MF4: VTypeInfo<vint8mf4_t, vbool32_t, 8, VR, V_MF4>; - def VI8MF2: VTypeInfo<vint8mf2_t, vbool16_t, 8, VR, V_MF2>; - def VI16MF4: VTypeInfo<vint16mf4_t, vbool64_t, 16, VR, V_MF4>; - def VI16MF2: VTypeInfo<vint16mf2_t, vbool32_t, 16, VR, V_MF2>; - def VI32MF2: VTypeInfo<vint32mf2_t, vbool64_t, 32, VR, V_MF2>; + def VI8MF8: VTypeInfo<vint8mf8_t, vbool64_t, 8, V_MF8>; + def VI8MF4: VTypeInfo<vint8mf4_t, vbool32_t, 8, V_MF4>; + def VI8MF2: VTypeInfo<vint8mf2_t, vbool16_t, 8, V_MF2>; + def VI16MF4: VTypeInfo<vint16mf4_t, vbool64_t, 16, V_MF4>; + def VI16MF2: VTypeInfo<vint16mf2_t, vbool32_t, 16, V_MF2>; + def VI32MF2: VTypeInfo<vint32mf2_t, vbool64_t, 32, V_MF2>; } - def VI8M1: VTypeInfo<vint8m1_t, vbool8_t, 8, VR, V_M1>; - def VI16M1: VTypeInfo<vint16m1_t, vbool16_t, 16, VR, V_M1>; - def VI32M1: VTypeInfo<vint32m1_t, vbool32_t, 32, VR, V_M1>; - def VI64M1: VTypeInfo<vint64m1_t, vbool64_t, 64, VR, V_M1>; + def VI8M1: VTypeInfo<vint8m1_t, vbool8_t, 8, V_M1>; + def VI16M1: VTypeInfo<vint16m1_t, vbool16_t, 16, V_M1>; + def VI32M1: VTypeInfo<vint32m1_t, vbool32_t, 32, V_M1>; + def VI64M1: VTypeInfo<vint64m1_t, vbool64_t, 64, V_M1>; } defset list<GroupVTypeInfo> GroupIntegerVectors = { - def VI8M2: GroupVTypeInfo<vint8m2_t, vint8m1_t, vbool4_t, 8, VRM2, V_M2>; - def VI8M4: GroupVTypeInfo<vint8m4_t, vint8m1_t, vbool2_t, 8, VRM4, V_M4>; - def VI8M8: GroupVTypeInfo<vint8m8_t, vint8m1_t, vbool1_t, 8, VRM8, V_M8>; + def VI8M2: GroupVTypeInfo<vint8m2_t, vint8m1_t, vbool4_t, 8, V_M2>; + def VI8M4: GroupVTypeInfo<vint8m4_t, vint8m1_t, vbool2_t, 8, V_M4>; + def VI8M8: GroupVTypeInfo<vint8m8_t, vint8m1_t, vbool1_t, 8, V_M8>; - def VI16M2: GroupVTypeInfo<vint16m2_t,vint16m1_t,vbool8_t, 16,VRM2, V_M2>; - def VI16M4: GroupVTypeInfo<vint16m4_t,vint16m1_t,vbool4_t, 16,VRM4, V_M4>; - def VI16M8: GroupVTypeInfo<vint16m8_t,vint16m1_t,vbool2_t, 16,VRM8, V_M8>; + def VI16M2: GroupVTypeInfo<vint16m2_t, vint16m1_t, vbool8_t, 16, V_M2>; + def VI16M4: GroupVTypeInfo<vint16m4_t, vint16m1_t, vbool4_t, 16, V_M4>; + def VI16M8: GroupVTypeInfo<vint16m8_t, vint16m1_t, vbool2_t, 16, V_M8>; - def VI32M2: GroupVTypeInfo<vint32m2_t,vint32m1_t,vbool16_t,32,VRM2, V_M2>; - def VI32M4: GroupVTypeInfo<vint32m4_t,vint32m1_t,vbool8_t, 32,VRM4, V_M4>; - def VI32M8: GroupVTypeInfo<vint32m8_t,vint32m1_t,vbool4_t, 32,VRM8, V_M8>; + def 
VI32M2: GroupVTypeInfo<vint32m2_t, vint32m1_t, vbool16_t, 32, V_M2>; + def VI32M4: GroupVTypeInfo<vint32m4_t, vint32m1_t, vbool8_t, 32, V_M4>; + def VI32M8: GroupVTypeInfo<vint32m8_t, vint32m1_t, vbool4_t, 32, V_M8>; - def VI64M2: GroupVTypeInfo<vint64m2_t,vint64m1_t,vbool32_t,64,VRM2, V_M2>; - def VI64M4: GroupVTypeInfo<vint64m4_t,vint64m1_t,vbool16_t,64,VRM4, V_M4>; - def VI64M8: GroupVTypeInfo<vint64m8_t,vint64m1_t,vbool8_t, 64,VRM8, V_M8>; + def VI64M2: GroupVTypeInfo<vint64m2_t, vint64m1_t, vbool32_t, 64, V_M2>; + def VI64M4: GroupVTypeInfo<vint64m4_t, vint64m1_t, vbool16_t, 64, V_M4>; + def VI64M8: GroupVTypeInfo<vint64m8_t, vint64m1_t, vbool8_t, 64, V_M8>; } } defset list<VTypeInfo> AllFloatVectors = { defset list<VTypeInfo> NoGroupFloatVectors = { defset list<VTypeInfo> FractionalGroupFloatVectors = { - def VF16MF4: VTypeInfo<vfloat16mf4_t, vbool64_t, 16, VR, V_MF4, f16, FPR16>; - def VF16MF2: VTypeInfo<vfloat16mf2_t, vbool32_t, 16, VR, V_MF2, f16, FPR16>; - def VF32MF2: VTypeInfo<vfloat32mf2_t,vbool64_t, 32, VR, V_MF2, f32, FPR32>; + def VF16MF4: VTypeInfo<vfloat16mf4_t, vbool64_t, 16, V_MF4, f16, FPR16>; + def VF16MF2: VTypeInfo<vfloat16mf2_t, vbool32_t, 16, V_MF2, f16, FPR16>; + def VF32MF2: VTypeInfo<vfloat32mf2_t, vbool64_t, 32, V_MF2, f32, FPR32>; } - def VF16M1: VTypeInfo<vfloat16m1_t, vbool16_t, 16, VR, V_M1, f16, FPR16>; - def VF32M1: VTypeInfo<vfloat32m1_t, vbool32_t, 32, VR, V_M1, f32, FPR32>; - def VF64M1: VTypeInfo<vfloat64m1_t, vbool64_t, 64, VR, V_M1, f64, FPR64>; + def VF16M1: VTypeInfo<vfloat16m1_t, vbool16_t, 16, V_M1, f16, FPR16>; + def VF32M1: VTypeInfo<vfloat32m1_t, vbool32_t, 32, V_M1, f32, FPR32>; + def VF64M1: VTypeInfo<vfloat64m1_t, vbool64_t, 64, V_M1, f64, FPR64>; } defset list<GroupVTypeInfo> GroupFloatVectors = { def VF16M2: GroupVTypeInfo<vfloat16m2_t, vfloat16m1_t, vbool8_t, 16, - VRM2, V_M2, f16, FPR16>; + V_M2, f16, FPR16>; def VF16M4: GroupVTypeInfo<vfloat16m4_t, vfloat16m1_t, vbool4_t, 16, - VRM4, V_M4, f16, FPR16>; + V_M4, f16, FPR16>; def VF16M8: GroupVTypeInfo<vfloat16m8_t, vfloat16m1_t, vbool2_t, 16, - VRM8, V_M8, f16, FPR16>; + V_M8, f16, FPR16>; def VF32M2: GroupVTypeInfo<vfloat32m2_t, vfloat32m1_t, vbool16_t, 32, - VRM2, V_M2, f32, FPR32>; + V_M2, f32, FPR32>; def VF32M4: GroupVTypeInfo<vfloat32m4_t, vfloat32m1_t, vbool8_t, 32, - VRM4, V_M4, f32, FPR32>; + V_M4, f32, FPR32>; def VF32M8: GroupVTypeInfo<vfloat32m8_t, vfloat32m1_t, vbool4_t, 32, - VRM8, V_M8, f32, FPR32>; + V_M8, f32, FPR32>; def VF64M2: GroupVTypeInfo<vfloat64m2_t, vfloat64m1_t, vbool32_t, 64, - VRM2, V_M2, f64, FPR64>; + V_M2, f64, FPR64>; def VF64M4: GroupVTypeInfo<vfloat64m4_t, vfloat64m1_t, vbool16_t, 64, - VRM4, V_M4, f64, FPR64>; + V_M4, f64, FPR64>; def VF64M8: GroupVTypeInfo<vfloat64m8_t, vfloat64m1_t, vbool8_t, 64, - VRM8, V_M8, f64, FPR64>; + V_M8, f64, FPR64>; } } } @@ -360,19 +360,19 @@ defset list<VTypeInfo> AllVectors = { defset list<VTypeInfo> AllBFloatVectors = { defset list<VTypeInfo> NoGroupBFloatVectors = { defset list<VTypeInfo> FractionalGroupBFloatVectors = { - def VBF16MF4: VTypeInfo<vbfloat16mf4_t, vbool64_t, 16, VR, V_MF4, bf16, FPR16>; - def VBF16MF2: VTypeInfo<vbfloat16mf2_t, vbool32_t, 16, VR, V_MF2, bf16, FPR16>; + def VBF16MF4: VTypeInfo<vbfloat16mf4_t, vbool64_t, 16, V_MF4, bf16, FPR16>; + def VBF16MF2: VTypeInfo<vbfloat16mf2_t, vbool32_t, 16, V_MF2, bf16, FPR16>; } - def VBF16M1: VTypeInfo<vbfloat16m1_t, vbool16_t, 16, VR, V_M1, bf16, FPR16>; + def VBF16M1: VTypeInfo<vbfloat16m1_t, vbool16_t, 16, V_M1, bf16, FPR16>; } defset 
list<GroupVTypeInfo> GroupBFloatVectors = {
    def VBF16M2: GroupVTypeInfo<vbfloat16m2_t, vbfloat16m1_t, vbool8_t, 16,
-                               VRM2, V_M2, bf16, FPR16>;
+                               V_M2, bf16, FPR16>;
    def VBF16M4: GroupVTypeInfo<vbfloat16m4_t, vbfloat16m1_t, vbool4_t, 16,
-                               VRM4, V_M4, bf16, FPR16>;
+                               V_M4, bf16, FPR16>;
    def VBF16M8: GroupVTypeInfo<vbfloat16m8_t, vbfloat16m1_t, vbool2_t, 16,
-                               VRM8, V_M8, bf16, FPR16>;
+                               V_M8, bf16, FPR16>;
  }
}

From a01b58aef0e42fb1b52e358adf4c56678a884d37 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <doru.bercea@amd.com>
Date: Wed, 27 Dec 2023 12:58:41 -0500
Subject: [PATCH 340/342] [OpenMP][libomptarget][Fix] Add missing array initialization (#76457)

Add the missing array initialization; the array was previously left
uninitialized and its elements were implicitly assumed to be zero.
---
 .../test/offloading/struct_mapping_with_pointers.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp b/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp
index befed120ca138..f0fde50889dac 100644
--- a/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp
+++ b/openmp/libomptarget/test/offloading/struct_mapping_with_pointers.cpp
@@ -29,6 +29,7 @@ int main() {
 
   dat.datum[7] = 7;
   dat.more_datum[17] = 17;
+  dat.datum[dat.arr[0][0]] = 0;
 
   /// The struct is mapped with type 0x0 when the pointer fields are mapped.
   /// The struct is also map explicitely by the user. The second mapping by

From 7f1c8fc25a4dbf34ed479e0f5c4e85f32d98b8f2 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Wed, 27 Dec 2023 12:27:12 -0800
Subject: [PATCH 341/342] [InstCombine] Use ConstantInt::getSigned to sign extend -2 for large types. (#76464)

Using ConstantInt::get will zero extend.

Fixes #76441
---
 .../InstCombine/InstCombineAddSub.cpp | 4 ++--
 .../Transforms/InstCombine/free-inversion.ll | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 719a2678fc189..556fde37efeb2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1685,8 +1685,8 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
       assert(NotLHS != nullptr && NotRHS != nullptr &&
              "isFreeToInvert desynced with getFreelyInverted");
       Value *LHSPlusRHS = Builder.CreateAdd(NotLHS, NotRHS);
-      return BinaryOperator::CreateSub(ConstantInt::get(RHS->getType(), -2),
-                                       LHSPlusRHS);
+      return BinaryOperator::CreateSub(
+          ConstantInt::getSigned(RHS->getType(), -2), LHSPlusRHS);
     }
   }
 
diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll
index 5e5e65164f707..be9bedbf79859 100644
--- a/llvm/test/Transforms/InstCombine/free-inversion.ll
+++ b/llvm/test/Transforms/InstCombine/free-inversion.ll
@@ -133,6 +133,24 @@ define i8 @sub_2(i8 %a, i1 %c, i8 %x, i8 %y) {
   ret i8 %not_ab
 }
 
+; Same as above but with a type larger than i64 to make sure we create -2
+; correctly.
+define i128 @sub_3(i128 %a, i1 %c, i128 %x, i128 %y) {
+; CHECK-LABEL: @sub_3(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i128 [[Y:%.*]], -124
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[C:%.*]], i128 [[X:%.*]], i128 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i128 [[TMP2]], [[A:%.*]]
+; CHECK-NEXT:    [[NOT_AB:%.*]] = sub i128 -2, [[TMP3]]
+; CHECK-NEXT:    ret i128 [[NOT_AB]]
+;
+  %nx = xor i128 %x, -1
+  %yy = xor i128 %y, 123
+  %b = select i1 %c, i128 %nx, i128 %yy
+  %ab = sub i128 %a, %b
+  %not_ab = xor i128 %ab, -1
+  ret i128 %not_ab
+}
+
 define i8 @sub_fail(i8 %a, i1 %c, i8 %x, i8 %y) {
 ; CHECK-LABEL: @sub_fail(
 ; CHECK-NEXT:    [[NX:%.*]] = xor i8 [[X:%.*]], -1

From bc8c4bbd7973ab9527a78a20000aecde9bed652d Mon Sep 17 00:00:00 2001
From: Alexey Bataev <5361294+alexey-bataev@users.noreply.github.com>
Date: Wed, 27 Dec 2023 15:57:21 -0500
Subject: [PATCH 342/342] [SLP][TTI][X86] Add addsub pattern cost estimation. (#76461)

SLP/TTI do not know about the cost estimation for the addsub pattern
supported by X86. Support for detecting the pattern was added earlier
(see TTI::isLegalAltInstr), but its cost was still not estimated properly.
---
 .../llvm/Analysis/TargetTransformInfo.h | 22 +++++++++++++++++++
 .../llvm/Analysis/TargetTransformInfoImpl.h | 7 ++++++
 llvm/lib/Analysis/TargetTransformInfo.cpp | 9 ++++++++
 .../lib/Target/X86/X86TargetTransformInfo.cpp | 9 ++++++++
 llvm/lib/Target/X86/X86TargetTransformInfo.h | 5 +++++
 .../Transforms/Vectorize/SLPVectorizer.cpp | 19 ++++++++++++++++
 .../Transforms/SLPVectorizer/X86/supernode.ll | 22 ++++++++++---------
 .../X86/vectorize-widest-phis.ll | 20 +++++++++--------
 8 files changed, 94 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 735be3680aea0..048912beaba5a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1243,6 +1243,18 @@ class TargetTransformInfo {
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
       const Instruction *CxtI = nullptr) const;
 
+  /// Returns the cost estimation for an alternating opcode pattern that can
+  /// be lowered to a single instruction on the target. On X86 this is for the
+  /// addsub instruction which corresponds to a Shuffle + Fadd + FSub pattern
+  /// in IR. This function expects two opcodes: \p Opcode0 and \p Opcode1
+  /// being selected by \p OpcodeMask. The mask contains one bit per lane and
+  /// is `0` when \p Opcode0 is selected and `1` when \p Opcode1 is selected.
+  /// \p VecTy is the vector type of the instruction to be generated.
+  InstructionCost getAltInstrCost(
+      VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
+      const SmallBitVector &OpcodeMask,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+
   /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
   /// The exact mask may be passed as Mask, or else the array will be empty.
/// The index and subtype parameters are used by the subvector insertion and @@ -1944,6 +1956,10 @@ class TargetTransformInfo::Concept { unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, OperandValueInfo Opd1Info, OperandValueInfo Opd2Info, ArrayRef<const Value *> Args, const Instruction *CxtI = nullptr) = 0; + virtual InstructionCost getAltInstrCost( + VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, + const SmallBitVector &OpcodeMask, + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const = 0; virtual InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, @@ -2555,6 +2571,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info, Args, CxtI); } + InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, + unsigned Opcode1, + const SmallBitVector &OpcodeMask, + TTI::TargetCostKind CostKind) const override { + return Impl.getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind); + } InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 1d8f523e9792b..7ad3ce512a355 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -554,6 +554,13 @@ class TargetTransformInfoImplBase { return 1; } + InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, + unsigned Opcode1, + const SmallBitVector &OpcodeMask, + TTI::TargetCostKind CostKind) const { + return InstructionCost::getInvalid(); + } + InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 3f76dfdaac317..67246afa23147 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -862,6 +862,15 @@ InstructionCost TargetTransformInfo::getArithmeticInstrCost( return Cost; } +InstructionCost TargetTransformInfo::getAltInstrCost( + VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, + const SmallBitVector &OpcodeMask, TTI::TargetCostKind CostKind) const { + InstructionCost Cost = + TTIImpl->getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind); + assert(Cost >= 0 && "TTI should not produce negative costs!"); + return Cost; +} + InstructionCost TargetTransformInfo::getShuffleCost( ShuffleKind Kind, VectorType *Ty, ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 8a04987e768a1..e09dc7ff02a07 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1459,6 +1459,15 @@ InstructionCost X86TTIImpl::getArithmeticInstrCost( Args, CxtI); } +InstructionCost +X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0, + unsigned Opcode1, const SmallBitVector &OpcodeMask, + TTI::TargetCostKind CostKind) const { + if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) + return TTI::TCC_Basic; + return InstructionCost::getInvalid(); +} + InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask, diff --git 
a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index 0fa0d240a548b..07a3fff4f84b3 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -140,6 +140,11 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None}, ArrayRef<const Value *> Args = ArrayRef<const Value *>(), const Instruction *CxtI = nullptr); + InstructionCost getAltInstrCost(VectorType *VecTy, unsigned Opcode0, + unsigned Opcode1, + const SmallBitVector &OpcodeMask, + TTI::TargetCostKind CostKind) const; + InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, TTI::TargetCostKind CostKind, int Index, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 32913b3f55697..c0ace2996c32c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8428,6 +8428,25 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, Mask); VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, FinalVecTy, Mask); + // Patterns like [fadd,fsub] can be combined into a single instruction + // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we + // need to take into account their order when looking for the most used + // order. + unsigned Opcode0 = E->getOpcode(); + unsigned Opcode1 = E->getAltOpcode(); + // The opcode mask selects between the two opcodes. + SmallBitVector OpcodeMask(E->Scalars.size(), false); + for (unsigned Lane : seq<unsigned>(0, E->Scalars.size())) + if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1) + OpcodeMask.set(Lane); + // If this pattern is supported by the target then we consider the + // order. + if (TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { + InstructionCost AltVecCost = + TTI->getAltInstrCost(VecTy, Opcode0, Opcode1, OpcodeMask, CostKind); + return AltVecCost < VecCost ? AltVecCost : VecCost; + } + // TODO: Check the reverse order too. 
return VecCost; }; return GetCostDiff(GetScalarCost, GetVectorCost); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll index d4c71285a93ab..87063fc3f7a82 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll @@ -103,21 +103,23 @@ define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, pt ; ENABLED-LABEL: @test_supernode_addsub_alt( ; ENABLED-NEXT: entry: ; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1 ; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, ptr [[SARRAY:%.*]], i64 1 ; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8 ; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 -; ENABLED-NEXT: [[B0:%.*]] = load double, ptr [[BARRAY]], align 8 -; ENABLED-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8 ; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8 -; ENABLED-NEXT: [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]] -; ENABLED-NEXT: [[ADDB1C1:%.*]] = fadd fast double [[B1]], [[C1]] -; ENABLED-NEXT: [[SUB0:%.*]] = fsub fast double [[SUBA0B0]], [[C0]] -; ENABLED-NEXT: [[ADD1:%.*]] = fadd fast double [[ADDB1C1]], [[A1]] -; ENABLED-NEXT: store double [[SUB0]], ptr [[SARRAY]], align 8 -; ENABLED-NEXT: store double [[ADD1]], ptr [[IDXS1]], align 8 +; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 +; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1 +; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]] +; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP0]] +; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> <i32 0, i32 3> +; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 +; ENABLED-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1 +; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] +; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]] +; ENABLED-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3> +; ENABLED-NEXT: store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll index aa3c2be7dc9c2..17f9f371ff6ef 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -12,22 +12,24 @@ define void @foo() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[CONV]], i32 1 ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP10:%.*]], [[BB3:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <4 x float> [ [[TMP1]], [[BB1]] ], [ [[TMP14:%.*]], [[BB3:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr undef, align 8 ; CHECK-NEXT: br i1 undef, label [[BB3]], label 
[[BB4:%.*]] ; CHECK: bb4: ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> ; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double -; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[TMP3]], [[CONV2]] -; CHECK-NEXT: [[SUB1:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> <double poison, double poison, double undef, double undef>, double [[SUB1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[ADD1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt <4 x double> [[TMP6]], [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = fptrunc <4 x double> [[TMP6]] to <4 x float> -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP2]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[CONV2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP11:%.*]] = fcmp ogt <4 x double> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = fptrunc <4 x double> [[TMP10]] to <4 x float> +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP2]], <4 x float> [[TMP12]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP10]] = phi <4 x float> [ [[TMP9]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] +; CHECK-NEXT: [[TMP14]] = phi <4 x float> [ [[TMP13]], [[BB4]] ], [ [[TMP2]], [[BB2]] ] ; CHECK-NEXT: br label [[BB2]] ; entry: